ffmpeg / libavcodec / i386 / dsputil_mmx.c @ 73f51a4d

/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "dsputil.h"
#include "simple_idct.h"
#include "mpegvideo.h"
#include "x86_cpu.h"
#include "mmx.h"
#include "vp3dsp_mmx.h"
#include "vp3dsp_sse2.h"
#include "h263.h"

//#undef NDEBUG
//#include <assert.h>

extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */

/* pixel operations */
static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
{0x8000000080000000ULL, 0x8000000080000000ULL};

static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
static const uint64_t ff_pw_4  attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
static const uint64_t ff_pw_5  attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
static const uint64_t ff_pw_8  attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;

static const uint64_t ff_pb_1  attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t ff_pb_3  attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
static const uint64_t ff_pb_7  attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
68 4454dc1b John Dalgliesh
#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
69 d6a4c0b1 Zdenek Kabelac
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
70
71 fca0f0e5 Zdenek Kabelac
#define MOVQ_WONE(regd) \
72
    __asm __volatile ( \
73
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
74
    "psrlw $15, %%" #regd ::)
75
76
#define MOVQ_BFE(regd) \
77
    __asm __volatile ( \
78
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
79
    "paddb %%" #regd ", %%" #regd " \n\t" ::)
80
81 d6a4c0b1 Zdenek Kabelac
#ifndef PIC
82 fca0f0e5 Zdenek Kabelac
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
83 d6a4c0b1 Zdenek Kabelac
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
84
#else
85
// for shared library it's better to use this way for accessing constants
86
// pcmpeqd -> -1
87 fca0f0e5 Zdenek Kabelac
#define MOVQ_BONE(regd) \
88 d6a4c0b1 Zdenek Kabelac
    __asm __volatile ( \
89 fca0f0e5 Zdenek Kabelac
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
90
    "psrlw $15, %%" #regd " \n\t" \
91
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)
92 d6a4c0b1 Zdenek Kabelac
93
#define MOVQ_WTWO(regd) \
94
    __asm __volatile ( \
95 fca0f0e5 Zdenek Kabelac
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
96
    "psrlw $15, %%" #regd " \n\t" \
97
    "psllw $1, %%" #regd " \n\t"::)
98 a7bd8797 Michael Niedermayer
99 d6a4c0b1 Zdenek Kabelac
#endif
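
/* Note (added commentary, not in the original source): the PIC variants
 * above synthesize their constants entirely in registers. "pcmpeqd r,r"
 * sets every bit of r (each dword compares equal to itself, giving -1);
 * "psrlw $15" then leaves 0x0001 in each 16-bit word. MOVQ_WTWO doubles
 * that to 0x0002 per word with "psllw $1", and MOVQ_BONE narrows the
 * words to 0x01 bytes with "packuswb", reproducing mm_wtwo and mm_bone
 * without any memory access. */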

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
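
/* Illustrative scalar equivalent (added commentary, not in the original
 * source): the macros above use the standard bit-trick for averaging
 * packed bytes without needing a 9-bit intermediate. Per byte, assuming
 * 8-bit unsigned samples:
 *
 *     // truncating average, as PAVGB_MMX_NO_RND computes:
 *     //   (a + b) >> 1      ==  (a & b) + (((a ^ b) & 0xFE) >> 1)
 *     // rounding average, as PAVGB_MMX computes:
 *     //   (a + b + 1) >> 1  ==  (a | b) - (((a ^ b) & 0xFE) >> 1)
 *
 * The 0xFE mask (regfe/mm6) clears each byte's low bit before the single
 * 64-bit "psrlq $1", so no bit leaks from one byte into its neighbour. */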
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB
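
/* Note (added commentary, not in the original source): this is the
 * template-include idiom -- dsputil_mmx_rnd.h and dsputil_mmx_avg.h are
 * included several times, each time with DEF/SET_RND/PAVGB(P) bound to a
 * different naming scheme and averaging primitive, so a single body of
 * code expands into the _no_rnd_*_mmx, *_mmx, *_3dnow and *_mmx2
 * function variants. */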

#define SBUTTERFLY(a,b,t,n,m)\
    "mov" #m " " #a ", " #t "         \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
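
/* Note (added commentary, not in the original source): TRANSPOSE4 is the
 * classic butterfly transpose of a 4x4 matrix of 16-bit words held in
 * registers a,b,c,d: two word-interleave passes followed by two
 * dword-interleave passes. With input rows a=abcd b=efgh c=ijkl d=mnop,
 * the transposed columns end up in a, d, t and c respectively, as the
 * per-step comments trace; t serves as the scratch register. */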

/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}
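
/* Illustrative scalar equivalent (added commentary, not in the original
 * source): get_pixels_mmx() widens an 8x8 block of unsigned bytes to
 * 16-bit DCT coefficients, two rows per loop iteration:
 *
 *     for (int i = 0; i < 8; i++)
 *         for (int j = 0; j < 8; j++)
 *             block[i*8 + j] = pixels[i*line_size + j];
 *
 * The punpck{l,h}bw against the zeroed mm7 performs the byte-to-word
 * zero extension; diff_pixels_mmx() below is the analogous
 * s1[i] - s2[i] version used for inter blocks. */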

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
#endif //CONFIG_ENCODERS

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
        __asm __volatile(
                "movq   %3, %%mm0               \n\t"
                "movq   8%3, %%mm1              \n\t"
                "movq   16%3, %%mm2             \n\t"
                "movq   24%3, %%mm3             \n\t"
                "movq   32%3, %%mm4             \n\t"
                "movq   40%3, %%mm5             \n\t"
                "movq   48%3, %%mm6             \n\t"
                "movq   56%3, %%mm7             \n\t"
                "packuswb %%mm1, %%mm0          \n\t"
                "packuswb %%mm3, %%mm2          \n\t"
                "packuswb %%mm5, %%mm4          \n\t"
                "packuswb %%mm7, %%mm6          \n\t"
                "movq   %%mm0, (%0)             \n\t"
                "movq   %%mm2, (%0, %1)         \n\t"
                "movq   %%mm4, (%0, %1, 2)      \n\t"
                "movq   %%mm6, (%0, %2)         \n\t"
                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
                :"memory");
        pix += line_size*4;
        p += 32;

    // if this were an exact copy of the code above, the compiler would
    // generate some very strange code, thus the "r" constraint is used
    // for the block pointer here
    __asm __volatile(
            "movq       (%3), %%mm0             \n\t"
            "movq       8(%3), %%mm1            \n\t"
            "movq       16(%3), %%mm2           \n\t"
            "movq       24(%3), %%mm3           \n\t"
            "movq       32(%3), %%mm4           \n\t"
            "movq       40(%3), %%mm5           \n\t"
            "movq       48(%3), %%mm6           \n\t"
            "movq       56(%3), %%mm7           \n\t"
            "packuswb %%mm1, %%mm0              \n\t"
            "packuswb %%mm3, %%mm2              \n\t"
            "packuswb %%mm5, %%mm4              \n\t"
            "packuswb %%mm7, %%mm6              \n\t"
            "movq       %%mm0, (%0)             \n\t"
            "movq       %%mm2, (%0, %1)         \n\t"
            "movq       %%mm4, (%0, %1, 2)      \n\t"
            "movq       %%mm6, (%0, %2)         \n\t"
            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
            :"memory");
}
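
/* Illustrative scalar equivalent (added commentary, not in the original
 * source): packuswb packs signed 16-bit words to unsigned bytes with
 * saturation, so the function amounts to
 *
 *     for (int i = 0; i < 8; i++)
 *         for (int j = 0; j < 8; j++)
 *             pixels[i*line_size + j] = av_clip_uint8(block[i*8 + j]);
 *
 * where av_clip_uint8() clamps to 0..255; each of the two asm blocks
 * handles four rows, eight coefficients at a time. */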

static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}
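
/* Note (added commentary, not in the original source): here the block
 * holds signed values centered on zero, so each row of eight
 * coefficients is packed to signed bytes with saturation (packsswb,
 * clamping to -128..127) and then biased into the unsigned pixel range
 * by adding 0x80 from vector128. This function also illustrates the
 * mmx.h wrapper-macro style (movq_m2r etc.) rather than one asm block. */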

void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
                "movq   (%2), %%mm0     \n\t"
                "movq   8(%2), %%mm1    \n\t"
                "movq   16(%2), %%mm2   \n\t"
                "movq   24(%2), %%mm3   \n\t"
                "movq   %0, %%mm4       \n\t"
                "movq   %1, %%mm6       \n\t"
                "movq   %%mm4, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm4, %%mm0    \n\t"
                "paddsw %%mm5, %%mm1    \n\t"
                "movq   %%mm6, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm6, %%mm2    \n\t"
                "paddsw %%mm5, %%mm3    \n\t"
                "packuswb %%mm1, %%mm0  \n\t"
                "packuswb %%mm3, %%mm2  \n\t"
                "movq   %%mm0, %0       \n\t"
                "movq   %%mm2, %1       \n\t"
                :"+m"(*pix), "+m"(*(pix+line_size))
                :"r"(p)
                :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "mov $-128*6, %%"REG_a"         \n\t"
                "1:                             \n\t"
                "movq %%mm7, (%0, %%"REG_a")    \n\t"
                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
                "add $32, %%"REG_a"             \n\t"
                " js 1b                         \n\t"
                : : "r" (((uint8_t *)blocks)+128*6)
                : "%"REG_a
        );
}

#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((long)line_size)
        );

        return sum;
}
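
/* Illustrative scalar equivalent (added commentary, not in the original
 * source): pix_sum16_mmx() returns the plain sum of a 16x16 block,
 *
 *     sum = 0;
 *     for (i = 0; i < 16; i++)
 *         for (j = 0; j < 16; j++)
 *             sum += pix[i*line_size + j];
 *
 * The word lanes accumulated in mm6 are folded with two shift-and-add
 * steps at the end; the final "andl $0xFFFF" keeps the low 16 bits,
 * which suffices since 256 pixels * 255 < 2^16. */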
#endif //CONFIG_ENCODERS

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7              \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "psubw %%mm2, %%mm0             \n\t"\
        "psubw %%mm3, %%mm1             \n\t"\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "punpcklbw %%mm7, %%mm4         \n\t"\
        "punpckhbw %%mm7, %%mm5         \n\t"\
        "psubw %%mm2, %%mm4             \n\t"\
        "psubw %%mm3, %%mm5             \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4             \n\t"\
        "paddw %%mm1, %%mm5             \n\t"\
        "pxor %%mm6, %%mm6              \n\t"\
        "pcmpgtw %%mm4, %%mm6           \n\t"\
        "pcmpgtw %%mm5, %%mm7           \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "pxor %%mm7, %%mm5              \n\t"\
        "psubw %%mm6, %%mm4             \n\t"\
        "psubw %%mm7, %%mm5             \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm4          \n\t"\
        "packsswb %%mm7, %%mm6          \n\t"\
        "pxor %%mm7, %%mm7              \n\t"\
        "movd %4, %%mm2                 \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "psubusb %%mm4, %%mm2           \n\t"\
        "movq %%mm2, %%mm3              \n\t"\
        "psubusb %%mm4, %%mm3           \n\t"\
        "psubb %%mm3, %%mm2             \n\t"\
        "movq %1, %%mm3                 \n\t"\
        "movq %2, %%mm4                 \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm3           \n\t"\
        "psubusb %%mm2, %%mm4           \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm2           \n\t"\
        "packsswb %%mm1, %%mm0          \n\t"\
        "pcmpgtb %%mm0, %%mm7           \n\t"\
        "pxor %%mm7, %%mm0              \n\t"\
        "psubb %%mm7, %%mm0             \n\t"\
        "movq %%mm0, %%mm1              \n\t"\
        "psubusb %%mm2, %%mm0           \n\t"\
        "psubb %%mm0, %%mm1             \n\t"\
        "pand %5, %%mm1                 \n\t"\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1              \n\t"\
        "psubb %%mm7, %%mm1             \n\t"\
        "movq %0, %%mm5                 \n\t"\
        "movq %3, %%mm6                 \n\t"\
        "psubb %%mm1, %%mm5             \n\t"\
        "paddb %%mm1, %%mm6             \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    uint64_t temp[4] __attribute__ ((aligned(8)));
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1              \n\t"
        "movq %%mm4, %%mm0              \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "movq %%mm1, %%mm6              \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd %%mm5, (%0)               \n\t"
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd %%mm5, (%0,%2)            \n\t"
        "movd %%mm3, (%0,%2,2)          \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd %%mm3, (%0,%3)            \n\t"
        "movd %%mm1, (%1)               \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%1,%2)            \n\t"
        "movd %%mm6, (%1,%2,2)          \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd %%mm6, (%1,%3)            \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride ),
           "r" ((long)(3*stride))
    );
    }
}

#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
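
/* Illustrative scalar equivalent (added commentary, not in the original
 * source): the sse8/sse16 routines compute the sum of squared errors
 * between two blocks,
 *
 *     tmp = 0;
 *     for (i = 0; i < h; i++)
 *         for (j = 0; j < 16; j++)   // 8 for sse8_mmx
 *             tmp += (pix1[j] - pix2[j]) * (pix1[j] - pix2[j]);
 *     // with pix1 += line_size, pix2 += line_size after each row
 *
 * MMX has no signed packed byte subtraction that keeps magnitude, so
 * |pix1 - pix2| is formed from two saturating subtractions ORed
 * together, then squared and accumulated pairwise with pmaddwd. */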

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((long)line_size));
    return tmp;
}

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}
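
/* Note (added commentary, not in the original source): hf_noise8/16
 * estimate high-frequency "noise" as the sum of absolute second-order
 * differences, i.e. how much the horizontal gradient changes from one
 * row to the next. Roughly, with d(x,y) = pix[x+1,y] - pix[x,y],
 *
 *     tmp = sum over x,y of |d(x,y) - d(x,y+1)|
 *
 * The pcmpgtw/pxor/psubw sequences are the usual branchless absolute
 * value of a signed 16-bit vector; hf_noise8 builds the shifted
 * neighbour with psllq/psrlq, hf_noise16 simply loads from offset 1. */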

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
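
/* Note (added commentary, not in the original source): the
 * noise-preserving SSE comparison combines plain SSE with the change in
 * high-frequency noise between the two blocks:
 *
 *     nsse = sse(pix1, pix2) + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|
 *
 * so a candidate that smooths texture away is penalized even when its
 * plain SSE is low; a weight of 8 is used when no context is given. */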

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
1275
1276
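/* Same vertical SAD, but MMX2's psadbw performs the byte abs-diff and the
 * horizontal accumulation in a single instruction, replacing the whole
 * psubusb/por/punpck sequence above. */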
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

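/* Vertical SAD of the difference of two blocks. The byte differences
 * pix1 - pix2 are signed, so each is XORed with 0x80 to bias it into
 * unsigned range before the unsigned abs-diff trick is reused. The
 * 0x8080808080808080 constant is built in mm7 without a memory load:
 * pcmpeqw gives all ones, psllw $15 makes each word 0x8000, and packsswb
 * saturates those words to 0x80 bytes. */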
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

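/* HuffYUV median prediction: pred = median(L, T, L + T - LT), computed
 * branchlessly by clamping L+T-LT between min(L,T) and max(L,T) with
 * pminub/pmaxub. All arithmetic is bytewise mod 256, matching the &0xFF
 * in the scalar leftmost-pixel case below; the stored output is X - pred. */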
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

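/* DIFF_PIXELS_1 widens two sets of bytes to words and subtracts them in one
 * pass: after the two punpcklbw, each word of "a" is (p1<<8)|p1 and each
 * word of "t" is (p1<<8)|p2, so the psubw leaves exactly p1 - p2 as a
 * signed word, with no separate zero register needed. */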
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "=m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#ifdef ARCH_X86_64
// permutes 01234567 -> 05736421
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
    SBUTTERFLY(c,d,b,wd,dqa)\
    SBUTTERFLY(e,f,d,wd,dqa)\
    SBUTTERFLY(g,h,f,wd,dqa)\
    SBUTTERFLY(a,c,h,dq,dqa)\
    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
    SBUTTERFLY(e,g,b,dq,dqa)\
    SBUTTERFLY(d,f,g,dq,dqa)\
    SBUTTERFLY(a,e,f,qdq,dqa)\
    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
    SBUTTERFLY(h,b,d,qdq,dqa)\
    SBUTTERFLY(c,g,b,qdq,dqa)\
    "movdqa %%xmm8, "#g"              \n\t"
#else
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
    "movdqa "#h", "#t"                \n\t"\
    SBUTTERFLY(a,b,h,wd,dqa)\
    "movdqa "#h", 16"#t"              \n\t"\
    "movdqa "#t", "#h"                \n\t"\
    SBUTTERFLY(c,d,b,wd,dqa)\
    SBUTTERFLY(e,f,d,wd,dqa)\
    SBUTTERFLY(g,h,f,wd,dqa)\
    SBUTTERFLY(a,c,h,dq,dqa)\
    "movdqa "#h", "#t"                \n\t"\
    "movdqa 16"#t", "#h"              \n\t"\
    SBUTTERFLY(h,b,c,dq,dqa)\
    SBUTTERFLY(e,g,b,dq,dqa)\
    SBUTTERFLY(d,f,g,dq,dqa)\
    SBUTTERFLY(a,e,f,qdq,dqa)\
    SBUTTERFLY(h,d,e,qdq,dqa)\
    "movdqa "#h", 16"#t"              \n\t"\
    "movdqa "#t", "#h"                \n\t"\
    SBUTTERFLY(h,b,d,qdq,dqa)\
    SBUTTERFLY(c,g,b,qdq,dqa)\
    "movdqa 16"#t", "#g"              \n\t"
#endif

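/* In-place butterfly on two register pairs: (a,b) -> (a+b, b-a). Doubling b
 * and then subtracting the new a (= a+b) yields b-a without a temporary
 * register, which matters here since all eight MMX registers are live. */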
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

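/* Three ways to take the absolute value of packed words, by ISA level:
 * plain MMX derives a sign mask with pcmpgtw and does the two's complement
 * negation by hand, MMX2 uses pmaxsw(x, -x), and SSSE3 has pabsw. */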
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1),    "#a"           \n\t"\
    "movq "#o"+8(%1),  "#b"           \n\t"\
    "movq "#o"+16(%1), "#c"           \n\t"\
    "movq "#o"+24(%1), "#d"           \n\t"\

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)              \n\t"\
    "movq "#b", "#o"+8(%1)            \n\t"\
    "movq "#c", "#o"+16(%1)           \n\t"\
    "movq "#d", "#o"+24(%1)           \n\t"\

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

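/* SATD: difference of two 8x8 blocks, 8x8 Hadamard transform (rows, then
 * columns via a transpose), then the sum of absolute transformed
 * coefficients. The MMX version works on two 4x8 halves with intermediate
 * results spilled to the temp buffer; saturating paddusw accumulation is
 * why the result is masked to 16 bits (see the HSUM_* FIXME above). */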
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

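/* sum_abs_dctelem: sum of |coeff| over the 64 coefficients of a DCTELEM
 * block (128 bytes). Each DCT_SAD4 invocation does four loads spaced 16
 * bytes apart and accumulates their absolute values into two running sums
 * (mm0/mm1 resp. xmm0/xmm1), keeping two dependency chains in flight. */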
#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

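/* Sum of squared differences between an int8 and an int16 array, eight
 * elements per iteration: the int8 samples are sign-extended to words by
 * the punpck/psraw pair (whatever punpckhbw mixed into the low bytes is
 * shifted out), and pmaddwd squares and pairwise-adds in a single step. */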
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    long i=size;
    asm volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}

#endif //CONFIG_ENCODERS

#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)

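/* MPEG-4 quarter-pel interpolation. The halfpel lowpass filter has taps
 * (-1, 3, -6, 20, 20, -6, 3, -1)/32; with x1..x4 denoting the sums of the
 * symmetric tap pairs, each output pixel is
 *     (20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5
 * which is what QPEL_V_LOW computes for one column step, and what the
 * scalar 3dnow fallbacks below spell out per pixel. */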
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
        "movq "#in7", " #m3 "             \n\t" /* d */\
        "movq "#in0", %%mm5               \n\t" /* D */\
        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
        "movq "#in1", %%mm5               \n\t" /* C */\
        "movq "#in2", %%mm6               \n\t" /* B */\
        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
        "paddw " #rnd ", %%mm4            \n\t" /* 20x1 - x4 + rnd */\
        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
        "psraw $5, %%mm5                  \n\t"\
        "packuswb %%mm5, %%mm5            \n\t"\
        OP(%%mm5, out, %%mm7, d)

#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        "movq %%mm0, %5                   \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2               \n\t" /* b */\
        "paddw %%mm5, %%mm3               \n\t" /* c */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm4               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "movq %5, %%mm1                   \n\t"\
        "packuswb %%mm3, %%mm1            \n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5               \n\t" /* b */\
        "paddw %%mm4, %%mm0               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2               \n\t" /* d */\
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6               \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3               \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4               \n\t" /* c */\
        "paddw %%mm2, %%mm5               \n\t" /* d */\
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                  \n\t"\
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                  \n\t"\
        "packuswb %%mm4, %%mm0            \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
2057
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0               \n\t"\
            "movq 8(%0), %%mm1              \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0             \n\t"\
            "movq 24(%0), %%mm1             \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
2104 5c91a675 Zdenek Kabelac
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm2               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3               \n\t" /* c */\
        "paddw %%mm5, %%mm4               \n\t" /* d */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "packuswb %%mm3, %%mm0            \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
2170
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0           \n\t"\
            "movq 8(%0), %%mm1          \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            :"memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}

#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\