ffmpeg / libavcodec / i386 / dsputil_mmx.c @ fa9b873e

/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "dsputil.h"
#include "dsputil_mmx.h"
#include "simple_idct.h"
#include "mpegvideo.h"
#include "x86_cpu.h"
#include "mmx.h"
#include "vp3dsp_mmx.h"
#include "vp3dsp_sse2.h"
#include "h263.h"

//#undef NDEBUG
//#include <assert.h>

extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */

/* pixel operations */
DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_5  ) = 0x0005000500050005ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED_16(const uint64_t, ff_pw_128) = 0x0080008000800080ULL;

DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };

#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// for shared libraries it is better to generate the constants in registers
// than to access them from memory; pcmpeqd sets a register to all ones (-1)
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"
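
/* The two variants above are the classic SWAR byte-average trick:
 * (a & b) + (((a ^ b) & 0xFE) >> 1) averages rounding down, while
 * (a | b) - (((a ^ b) & 0xFE) >> 1) averages rounding up, avoiding the
 * 9-bit intermediate of (a + b + 1) >> 1.  A per-byte C sketch of what
 * each macro computes (illustrative sketch, names are hypothetical): */
static inline uint8_t avg_no_rnd_sketch(uint8_t a, uint8_t b){
    return (a & b) + (((a ^ b) & 0xFE) >> 1);   /* (a+b)   >> 1, like PAVGB_MMX_NO_RND */
}
static inline uint8_t avg_rnd_sketch(uint8_t a, uint8_t b){
    return (a | b) - (((a ^ b) & 0xFE) >> 1);   /* (a+b+1) >> 1, like PAVGB_MMX */
}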

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

154 91abb473 Zdenek Kabelac
/***********************************/
155
/* MMX no rounding */
156
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
157 fca0f0e5 Zdenek Kabelac
#define SET_RND  MOVQ_WONE
158 bb270c08 Diego Biurrun
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
159
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
160 fca0f0e5 Zdenek Kabelac
161 91abb473 Zdenek Kabelac
#include "dsputil_mmx_rnd.h"
162
163
#undef DEF
164 fca0f0e5 Zdenek Kabelac
#undef SET_RND
165 6aa6ea8e Zdenek Kabelac
#undef PAVGBP
166 39825f31 Zdenek Kabelac
#undef PAVGB
167 91abb473 Zdenek Kabelac
/***********************************/
168
/* MMX rounding */
169
170
#define DEF(x, y) x ## _ ## y ##_mmx
171 fca0f0e5 Zdenek Kabelac
#define SET_RND  MOVQ_WTWO
172 bb270c08 Diego Biurrun
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
173
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
174 fca0f0e5 Zdenek Kabelac
175 91abb473 Zdenek Kabelac
#include "dsputil_mmx_rnd.h"
176
177
#undef DEF
178 fca0f0e5 Zdenek Kabelac
#undef SET_RND
179 6aa6ea8e Zdenek Kabelac
#undef PAVGBP
180 39825f31 Zdenek Kabelac
#undef PAVGB
181 a7bd8797 Michael Niedermayer
/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx

/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}
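
/* What get_pixels_mmx computes, as a plain C sketch (illustrative only;
 * the helper name is hypothetical): widen an 8x8 block of bytes to 16-bit
 * DCT coefficients.  The asm handles two rows per iteration with
 * punpck{l,h}bw against the zeroed mm7. */
static inline void get_pixels_c_sketch(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i*8 + j] = pixels[j];     /* zero-extend byte -> int16 */
        pixels += line_size;
    }
}
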
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
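
/* C sketch of diff_pixels_mmx (illustrative only; helper name is
 * hypothetical): block[i] = s1[i] - s2[i] over an 8x8 area, widened to
 * 16 bits exactly as the punpck/psubw sequence above does. */
static inline void diff_pixels_c_sketch(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i*8 + j] = s1[j] - s2[j];
        s1 += stride;
        s2 += stride;
    }
}
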
#endif //CONFIG_ENCODERS

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
        __asm __volatile(
                "movq   %3, %%mm0               \n\t"
                "movq   8%3, %%mm1              \n\t"
                "movq   16%3, %%mm2             \n\t"
                "movq   24%3, %%mm3             \n\t"
                "movq   32%3, %%mm4             \n\t"
                "movq   40%3, %%mm5             \n\t"
                "movq   48%3, %%mm6             \n\t"
                "movq   56%3, %%mm7             \n\t"
                "packuswb %%mm1, %%mm0          \n\t"
                "packuswb %%mm3, %%mm2          \n\t"
                "packuswb %%mm5, %%mm4          \n\t"
                "packuswb %%mm7, %%mm6          \n\t"
                "movq   %%mm0, (%0)             \n\t"
                "movq   %%mm2, (%0, %1)         \n\t"
                "movq   %%mm4, (%0, %1, 2)      \n\t"
                "movq   %%mm6, (%0, %2)         \n\t"
                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
                :"memory");
        pix += line_size*4;
        p += 32;

    // if this were an exact copy of the code above, the compiler would
    // generate some very strange code; thus the "r" constraint is used here
    __asm __volatile(
            "movq       (%3), %%mm0             \n\t"
            "movq       8(%3), %%mm1            \n\t"
            "movq       16(%3), %%mm2           \n\t"
            "movq       24(%3), %%mm3           \n\t"
            "movq       32(%3), %%mm4           \n\t"
            "movq       40(%3), %%mm5           \n\t"
            "movq       48(%3), %%mm6           \n\t"
            "movq       56(%3), %%mm7           \n\t"
            "packuswb %%mm1, %%mm0              \n\t"
            "packuswb %%mm3, %%mm2              \n\t"
            "packuswb %%mm5, %%mm4              \n\t"
            "packuswb %%mm7, %%mm6              \n\t"
            "movq       %%mm0, (%0)             \n\t"
            "movq       %%mm2, (%0, %1)         \n\t"
            "movq       %%mm4, (%0, %1, 2)      \n\t"
            "movq       %%mm6, (%0, %2)         \n\t"
            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
            :"memory");
}
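
/* Both asm blocks above store a 16-bit 8x8 block as bytes with unsigned
 * saturation, four rows at a time.  A C sketch of the whole function
 * (illustrative only; helper name is hypothetical): */
static inline void put_pixels_clamped_c_sketch(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = block[i*8 + j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;  /* what packuswb does */
        }
        pixels += line_size;
    }
}
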
static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}
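
/* C sketch of put_signed_pixels_clamped_mmx (illustrative only; helper
 * name is hypothetical): packsswb saturates each coefficient to -128..127
 * and the paddb with vector128 then biases that range to 0..255. */
static inline void put_signed_pixels_clamped_c_sketch(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = block[i*8 + j];
            if (v < -128) v = -128;
            if (v >  127) v =  127;
            pixels[j] = v + 128;    /* the wrapping paddb yields the same value here */
        }
        pixels += line_size;
    }
}
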
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
                "movq   (%2), %%mm0     \n\t"
                "movq   8(%2), %%mm1    \n\t"
                "movq   16(%2), %%mm2   \n\t"
                "movq   24(%2), %%mm3   \n\t"
                "movq   %0, %%mm4       \n\t"
                "movq   %1, %%mm6       \n\t"
                "movq   %%mm4, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm4, %%mm0    \n\t"
                "paddsw %%mm5, %%mm1    \n\t"
                "movq   %%mm6, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm6, %%mm2    \n\t"
                "paddsw %%mm5, %%mm3    \n\t"
                "packuswb %%mm1, %%mm0  \n\t"
                "packuswb %%mm3, %%mm2  \n\t"
                "movq   %%mm0, %0       \n\t"
                "movq   %%mm2, %1       \n\t"
                :"+m"(*pix), "+m"(*(pix+line_size))
                :"r"(p)
                :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}
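
/* C sketch of add_pixels_clamped_mmx (illustrative only; helper name is
 * hypothetical): add the 16-bit residual to the existing pixels and clamp
 * back to 0..255, which is what the paddsw/packuswb pair achieves for the
 * value ranges seen in practice. */
static inline void add_pixels_clamped_c_sketch(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = pixels[j] + block[i*8 + j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        pixels += line_size;
    }
}
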
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}
481 649c00c9 Michael Niedermayer
static void clear_blocks_mmx(DCTELEM *blocks)
482
{
483 39825f31 Zdenek Kabelac
    __asm __volatile(
484 bb270c08 Diego Biurrun
                "pxor %%mm7, %%mm7              \n\t"
485
                "mov $-128*6, %%"REG_a"         \n\t"
486
                "1:                             \n\t"
487
                "movq %%mm7, (%0, %%"REG_a")    \n\t"
488
                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
489
                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
490
                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
491
                "add $32, %%"REG_a"             \n\t"
492
                " js 1b                         \n\t"
493 053dea12 Aurelien Jacobs
                : : "r" (((uint8_t *)blocks)+128*6)
494
                : "%"REG_a
495 649c00c9 Michael Niedermayer
        );
496
}
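
/* clear_blocks_mmx zeroes six 64-coefficient blocks (768 int16s) with a
 * backward-indexed loop, 32 bytes per iteration.  C sketch (illustrative
 * only; helper name is hypothetical): */
static inline void clear_blocks_c_sketch(DCTELEM *blocks)
{
    int i;
    for (i = 0; i < 6*64; i++)      /* same as memset(blocks, 0, 6*64*sizeof(DCTELEM)) */
        blocks[i] = 0;
}
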
#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((long)line_size)
        );

        return sum;
}
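
/* C sketch of pix_sum16_mmx (illustrative only; helper name is
 * hypothetical): the sum of all 256 bytes of a 16x16 block.  The maximum,
 * 256*255, still fits in 16 bits, which is why the asm can fold word-sized
 * partial sums and mask the result with 0xFFFF. */
static inline int pix_sum16_c_sketch(uint8_t *pix, int line_size)
{
    int x, y, sum = 0;
    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += pix[x];
        pix += line_size;
    }
    return sum & 0xFFFF;
}
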
#endif //CONFIG_ENCODERS

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7              \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "psubw %%mm2, %%mm0             \n\t"\
        "psubw %%mm3, %%mm1             \n\t"\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "punpcklbw %%mm7, %%mm4         \n\t"\
        "punpckhbw %%mm7, %%mm5         \n\t"\
        "psubw %%mm2, %%mm4             \n\t"\
        "psubw %%mm3, %%mm5             \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4             \n\t"\
        "paddw %%mm1, %%mm5             \n\t"\
        "pxor %%mm6, %%mm6              \n\t"\
        "pcmpgtw %%mm4, %%mm6           \n\t"\
        "pcmpgtw %%mm5, %%mm7           \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "pxor %%mm7, %%mm5              \n\t"\
        "psubw %%mm6, %%mm4             \n\t"\
        "psubw %%mm7, %%mm5             \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm4          \n\t"\
        "packsswb %%mm7, %%mm6          \n\t"\
        "pxor %%mm7, %%mm7              \n\t"\
        "movd %4, %%mm2                 \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "psubusb %%mm4, %%mm2           \n\t"\
        "movq %%mm2, %%mm3              \n\t"\
        "psubusb %%mm4, %%mm3           \n\t"\
        "psubb %%mm3, %%mm2             \n\t"\
        "movq %1, %%mm3                 \n\t"\
        "movq %2, %%mm4                 \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm3           \n\t"\
        "psubusb %%mm2, %%mm4           \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm2           \n\t"\
        "packsswb %%mm1, %%mm0          \n\t"\
        "pcmpgtb %%mm0, %%mm7           \n\t"\
        "pxor %%mm7, %%mm0              \n\t"\
        "psubb %%mm7, %%mm0             \n\t"\
        "movq %%mm0, %%mm1              \n\t"\
        "psubusb %%mm2, %%mm0           \n\t"\
        "psubb %%mm0, %%mm1             \n\t"\
        "pand %5, %%mm1                 \n\t"\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1              \n\t"\
        "psubb %%mm7, %%mm1             \n\t"\
        "movq %0, %%mm5                 \n\t"\
        "movq %3, %%mm6                 \n\t"\
        "psubb %%mm1, %%mm5             \n\t"\
        "paddb %%mm1, %%mm6             \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}
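
/* C sketch of transpose4x4 (illustrative only; helper name is
 * hypothetical): the punpck ladder above is a plain 4x4 byte-matrix
 * transpose, dst[i][j] = src[j][i]. */
static inline void transpose4x4_c_sketch(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride)
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++)
            dst[i*dst_stride + j] = src[j*src_stride + i];
}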

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    DECLARE_ALIGNED(8, uint64_t, temp[4]);
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1              \n\t"
        "movq %%mm4, %%mm0              \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "movq %%mm1, %%mm6              \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd %%mm5, (%0)               \n\t"
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd %%mm5, (%0,%2)            \n\t"
        "movd %%mm3, (%0,%2,2)          \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd %%mm3, (%0,%3)            \n\t"
        "movd %%mm1, (%1)               \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%1,%2)            \n\t"
        "movd %%mm6, (%1,%2,2)          \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd %%mm6, (%1,%3)            \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride ),
           "r" ((long)(3*stride))
    );
    }
}

#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}
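
/* C sketch of pix_norm1_mmx (illustrative only; helper name is
 * hypothetical): the sum of squared pixel values over a 16x16 block;
 * pmaddwd squares and pairwise-adds two words per dword lane. */
static inline int pix_norm1_c_sketch(uint8_t *pix, int line_size)
{
    int x, y, sum = 0;
    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += pix[x] * pix[x];
        pix += line_size;
    }
    return sum;
}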

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
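
/* C sketch of sse8_mmx/sse16_mmx (illustrative only; helper name is
 * hypothetical): the sum of squared byte differences.  The asm obtains
 * |a-b| from the two saturated subtractions ORed together, then squares
 * with pmaddwd. */
static inline int sse_c_sketch(uint8_t *pix1, uint8_t *pix2, int line_size, int w, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x];
            sum += d*d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}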

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((long)line_size));
    return tmp;
}

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
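
/* What the hf_noise functions measure, as a C sketch (illustrative only;
 * helper name is hypothetical, and the exact boundary handling is assumed
 * from the asm above): the absolute vertical difference of horizontal
 * gradients, a cheap estimate of high-frequency energy.  hf_noise8_mmx
 * covers the 7 column pairs of an 8-pixel-wide strip; hf_noise16_mmx
 * handles the left half and calls hf_noise8_mmx for the right half. */
static inline int hf_noise8_c_sketch(uint8_t *pix, int line_size, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 7; x++) {
            int d0 = pix[x]             - pix[x + 1];
            int d1 = pix[x + line_size] - pix[x + line_size + 1];
            sum += d0 > d1 ? d0 - d1 : d1 - d0;     /* |d0 - d1| */
        }
        pix += line_size;
    }
    return sum;
}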

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
1282
1283
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

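/* vsad16: the inter variant, measuring the vertical gradient of the
 * difference signal between two blocks (cf. vsad16_c):
 *     for(y=1; y<h; y++)
 *         for(x=0; x<16; x++)
 *             score += FFABS( pix1[x + y*line_size] - pix2[x + y*line_size]
 *                            -pix1[x+(y-1)*line_size] + pix2[x+(y-1)*line_size]);
 * The byte differences are biased by 0x80 (the pxor with mm7 below) so that
 * signed values survive the unsigned byte ops. */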
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

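/* diff_bytes: dst[i] = src1[i] - src2[i] in wrapping byte arithmetic,
 * 16 bytes per iteration with a scalar tail loop; used by the HuffYUV
 * encoder's byte-wise predictors. */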
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

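/* HuffYUV median prediction: for each pixel, pred is the median of
 * L, T and L+T-LT, computed branchlessly for 8 pixels at a time as
 *     max(min(L,T), min(max(L,T), L+T-LT))
 * via the pmaxub/pminub sequence below; the output is src2 - pred.
 * Pixel 0 and the left/left_top state are finished in C after the loop. */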
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

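/* Building blocks for hadamard8_diff (SATD): DIFF_PIXELS_* load the byte
 * difference of two 8-pixel rows as signed words, HADAMARD8 is the 8-point
 * butterfly network, MMABS_* take per-word absolute values (MMX, MMX2 and
 * SSSE3 flavours) and HSUM_* do the final horizontal reduction. */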
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1),    "#a"           \n\t"\
    "movq "#o"+8(%1),  "#b"           \n\t"\
    "movq "#o"+16(%1), "#c"           \n\t"\
    "movq "#o"+24(%1), "#d"           \n\t"\

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)              \n\t"\
    "movq "#b", "#o"+8(%1)            \n\t"\
    "movq "#c", "#o"+16(%1)           \n\t"\
    "movq "#d", "#o"+24(%1)           \n\t"\

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

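/* hadamard8_diff: SATD, i.e. the sum of absolute values of the 2D 8-point
 * Hadamard transform of the difference block. With only eight 64-bit MMX
 * registers the 8x8 block is processed as two 4x8 halves spilled through
 * temp[]; the SSE2/SSSE3 variant below keeps the whole block in xmm
 * registers (and on x86_64 avoids even the one remaining spill). */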
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

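/* sum_abs_dctelem: sum of |coeff| over the 64 coefficients of a DCT block.
 * DCT_SAD4 folds four rows per invocation through the MMABS_SUM helpers;
 * the result saturates at 16 bits (see the HSUM_* note above). */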
#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

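/* ssd_int8_vs_int16: sum of squared differences between signed bytes and
 * signed words, i.e. the scalar loop
 *     for(i=0; i<size; i++) sum += (pix2[i]-pix1[i]) * (pix2[i]-pix1[i]);
 * The bytes are sign-extended with the punpck+psraw trick below. */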
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    long i=size;
    asm volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}

#endif //CONFIG_ENCODERS

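/* MPEG-4 quarter-pel interpolation. Each output pixel of the lowpass filters
 * is (20*a - 6*b + 3*c - d + rounder) >> 5, clamped to 8 bits, where a..d are
 * the sums of symmetrically placed input pixels. QPEL_V_LOW emits one row of
 * the vertical pass; the *_h_lowpass functions below do the horizontal pass
 * (the 3DNow! versions keep a scalar filter marked as a quick hack). */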
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
        "movq "#in7", " #m3 "             \n\t" /* d */\
        "movq "#in0", %%mm5               \n\t" /* D */\
        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
        "movq "#in1", %%mm5               \n\t" /* C */\
        "movq "#in2", %%mm6               \n\t" /* B */\
        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
        "paddw " #rnd ", %%mm4            \n\t" /* 20x1 - x4 + rounder */\
        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
        "psraw $5, %%mm5                  \n\t"\
        "packuswb %%mm5, %%mm5            \n\t"\
        OP(%%mm5, out, %%mm7, d)

#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        "movq %%mm0, %5                   \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2               \n\t" /* b */\
        "paddw %%mm5, %%mm3               \n\t" /* c */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm4               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "movq %5, %%mm1                   \n\t"\
        "packuswb %%mm3, %%mm1            \n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5               \n\t" /* b */\
        "paddw %%mm4, %%mm0               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2               \n\t" /* d */\
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6               \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3               \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4               \n\t" /* c */\
        "paddw %%mm2, %%mm5               \n\t" /* d */\
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                  \n\t"\
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                  \n\t"\
        "packuswb %%mm4, %%mm0            \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0               \n\t"\
            "movq 8(%0), %%mm1              \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0             \n\t"\
            "movq 24(%0), %%mm1             \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm2               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3               \n\t" /* c */\
        "paddw %%mm5, %%mm4               \n\t" /* d */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "packuswb %%mm3, %%mm0            \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0           \n\t"\
            "movq 8(%0), %%mm1          \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            :"memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}

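/* The vertical lowpass works in two passes: the source rows are first
 * unpacked to 16-bit words in temp[] (four 4-pixel-wide column strips of
 * 17 rows for the 16x16 case), then QPEL_V_LOW runs the same 20/-6/3/-1
 * filter down each strip. */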
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "movq 8(%0), %%mm2              \n\t"\
        "movq 8(%0), %%mm3              \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 17*8(%1)           \n\t"\
        "movq %%mm2, 2*17*8(%1)         \n\t"\
        "movq %%mm3, 3*17*8(%1)         \n\t"\
        "add $8, %1                     \n\t"\
        "add %3, %0                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=4;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7              \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\