ffmpeg / libavcodec / i386 / dsputil_mmx.c @ 1d67b037


/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "dsputil.h"
#include "dsputil_mmx.h"
#include "simple_idct.h"
#include "mpegvideo.h"
#include "x86_cpu.h"
#include "mmx.h"
#include "vp3dsp_mmx.h"
#include "vp3dsp_sse2.h"
#include "h263.h"

//#undef NDEBUG
//#include <assert.h>

extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */

/* pixel operations */
DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;

DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };

#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// for shared libraries it is better to access these constants the following way
// pcmpeqd -> -1
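// With PIC, an "m" reference to ff_bone/ff_wtwo would have to go through
// position-independent addressing, so the constants are synthesized in the
// target register instead: pcmpeqd produces all ones, which is then shifted
// and packed into 0x0101... (BONE) or shifted into 0x0002... (WTWO).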
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
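// Scalar equivalents of the two macros below, for reference:
//   rounded average   (PAVGB_MMX):        (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xFE) >> 1)
//   unrounded average (PAVGB_MMX_NO_RND):  (a + b)    >> 1 == (a & b) + (((a ^ b) & 0xFE) >> 1)
// The 0xFE mask keeps the single 64-bit psrlq from shifting bits across byte boundaries.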
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx

/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
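/* Read an 8x8 block of unsigned bytes and store it as 16-bit DCTELEMs:
   each row is widened by interleaving with the zeroed mm7, two rows per
   loop iteration. */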
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
#endif //CONFIG_ENCODERS

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
        __asm __volatile(
                "movq   %3, %%mm0               \n\t"
                "movq   8%3, %%mm1              \n\t"
                "movq   16%3, %%mm2             \n\t"
                "movq   24%3, %%mm3             \n\t"
                "movq   32%3, %%mm4             \n\t"
                "movq   40%3, %%mm5             \n\t"
                "movq   48%3, %%mm6             \n\t"
                "movq   56%3, %%mm7             \n\t"
                "packuswb %%mm1, %%mm0          \n\t"
                "packuswb %%mm3, %%mm2          \n\t"
                "packuswb %%mm5, %%mm4          \n\t"
                "packuswb %%mm7, %%mm6          \n\t"
                "movq   %%mm0, (%0)             \n\t"
                "movq   %%mm2, (%0, %1)         \n\t"
                "movq   %%mm4, (%0, %1, 2)      \n\t"
                "movq   %%mm6, (%0, %2)         \n\t"
                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
                :"memory");
        pix += line_size*4;
        p += 32;

    // if this were an exact copy of the code above, the compiler would
    // generate some very strange code, hence the pointer is passed in a
    // register ("r") here instead
    __asm __volatile(
            "movq       (%3), %%mm0             \n\t"
            "movq       8(%3), %%mm1            \n\t"
            "movq       16(%3), %%mm2           \n\t"
            "movq       24(%3), %%mm3           \n\t"
            "movq       32(%3), %%mm4           \n\t"
            "movq       40(%3), %%mm5           \n\t"
            "movq       48(%3), %%mm6           \n\t"
            "movq       56(%3), %%mm7           \n\t"
            "packuswb %%mm1, %%mm0              \n\t"
            "packuswb %%mm3, %%mm2              \n\t"
            "packuswb %%mm5, %%mm4              \n\t"
            "packuswb %%mm7, %%mm6              \n\t"
            "movq       %%mm0, (%0)             \n\t"
            "movq       %%mm2, (%0, %1)         \n\t"
            "movq       %%mm4, (%0, %1, 2)      \n\t"
            "movq       %%mm6, (%0, %2)         \n\t"
            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
            :"memory");
}

static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}

void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
                "movq   (%2), %%mm0     \n\t"
                "movq   8(%2), %%mm1    \n\t"
                "movq   16(%2), %%mm2   \n\t"
                "movq   24(%2), %%mm3   \n\t"
                "movq   %0, %%mm4       \n\t"
                "movq   %1, %%mm6       \n\t"
                "movq   %%mm4, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm4, %%mm0    \n\t"
                "paddsw %%mm5, %%mm1    \n\t"
                "movq   %%mm6, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm6, %%mm2    \n\t"
                "paddsw %%mm5, %%mm3    \n\t"
                "packuswb %%mm1, %%mm0  \n\t"
                "packuswb %%mm3, %%mm2  \n\t"
                "movq   %%mm0, %0       \n\t"
                "movq   %%mm2, %1       \n\t"
                :"+m"(*pix), "+m"(*(pix+line_size))
                :"r"(p)
                :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

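/* SSE2 copy of a 16-pixel-wide block: the source may be unaligned (movdqu)
   while the destination is assumed to be 16-byte aligned (movdqa); operand
   %4 holds 3*line_size, so four rows are handled per iteration. */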
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "1:                            \n\t"
         "movdqu (%1), %%xmm0           \n\t"
         "movdqu (%1,%3), %%xmm1        \n\t"
         "movdqu (%1,%3,2), %%xmm2      \n\t"
         "movdqu (%1,%4), %%xmm3        \n\t"
         "movdqa %%xmm0, (%2)           \n\t"
         "movdqa %%xmm1, (%2,%3)        \n\t"
         "movdqa %%xmm2, (%2,%3,2)      \n\t"
         "movdqa %%xmm3, (%2,%4)        \n\t"
         "subl $4, %0                   \n\t"
         "lea (%1,%3,4), %1             \n\t"
         "lea (%2,%3,4), %2             \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size), "r"(3L*line_size)
         : "memory"
        );
}

static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "1:                            \n\t"
         "movdqu (%1), %%xmm0           \n\t"
         "movdqu (%1,%3), %%xmm1        \n\t"
         "movdqu (%1,%3,2), %%xmm2      \n\t"
         "movdqu (%1,%4), %%xmm3        \n\t"
         "pavgb  (%2), %%xmm0           \n\t"
         "pavgb  (%2,%3), %%xmm1        \n\t"
         "pavgb  (%2,%3,2), %%xmm2      \n\t"
         "pavgb  (%2,%4), %%xmm3        \n\t"
         "movdqa %%xmm0, (%2)           \n\t"
         "movdqa %%xmm1, (%2,%3)        \n\t"
         "movdqa %%xmm2, (%2,%3,2)      \n\t"
         "movdqa %%xmm3, (%2,%4)        \n\t"
         "subl $4, %0                   \n\t"
         "lea (%1,%3,4), %1             \n\t"
         "lea (%2,%3,4), %2             \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size), "r"(3L*line_size)
         : "memory"
        );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "mov $-128*6, %%"REG_a"         \n\t"
                "1:                             \n\t"
                "movq %%mm7, (%0, %%"REG_a")    \n\t"
                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
                "add $32, %%"REG_a"             \n\t"
                " js 1b                         \n\t"
                : : "r" (((uint8_t *)blocks)+128*6)
                : "%"REG_a
        );
}

#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((long)line_size)
        );

        return sum;
}
#endif //CONFIG_ENCODERS

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7              \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "psubw %%mm2, %%mm0             \n\t"\
        "psubw %%mm3, %%mm1             \n\t"\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "punpcklbw %%mm7, %%mm4         \n\t"\
        "punpckhbw %%mm7, %%mm5         \n\t"\
        "psubw %%mm2, %%mm4             \n\t"\
        "psubw %%mm3, %%mm5             \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4             \n\t"\
        "paddw %%mm1, %%mm5             \n\t"\
        "pxor %%mm6, %%mm6              \n\t"\
        "pcmpgtw %%mm4, %%mm6           \n\t"\
        "pcmpgtw %%mm5, %%mm7           \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "pxor %%mm7, %%mm5              \n\t"\
        "psubw %%mm6, %%mm4             \n\t"\
        "psubw %%mm7, %%mm5             \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm4          \n\t"\
        "packsswb %%mm7, %%mm6          \n\t"\
        "pxor %%mm7, %%mm7              \n\t"\
        "movd %4, %%mm2                 \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "psubusb %%mm4, %%mm2           \n\t"\
        "movq %%mm2, %%mm3              \n\t"\
        "psubusb %%mm4, %%mm3           \n\t"\
        "psubb %%mm3, %%mm2             \n\t"\
        "movq %1, %%mm3                 \n\t"\
        "movq %2, %%mm4                 \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm3           \n\t"\
        "psubusb %%mm2, %%mm4           \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm2           \n\t"\
        "packsswb %%mm1, %%mm0          \n\t"\
        "pcmpgtb %%mm0, %%mm7           \n\t"\
        "pxor %%mm7, %%mm0              \n\t"\
        "psubb %%mm7, %%mm0             \n\t"\
        "movq %%mm0, %%mm1              \n\t"\
        "psubusb %%mm2, %%mm0           \n\t"\
        "psubb %%mm0, %%mm1             \n\t"\
        "pand %5, %%mm1                 \n\t"\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1              \n\t"\
        "psubb %%mm7, %%mm1             \n\t"\
        "movq %0, %%mm5                 \n\t"\
        "movq %3, %%mm6                 \n\t"\
        "psubb %%mm1, %%mm5             \n\t"\
        "paddb %%mm1, %%mm6             \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
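    /* deblock the horizontal edge between src[-stride] and src[0]: the two
       lines on either side of the block boundary are updated, with the
       correction clipped according to the strength derived from qscale */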

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    DECLARE_ALIGNED(8, uint64_t, temp[4]);
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
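    /* the vertical-edge filter reuses the same H263_LOOP_FILTER macro: the
       neighbourhood of the edge is transposed into temp, filtered as if it
       were a horizontal edge, and the second asm block below transposes the
       filtered columns back into place */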
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1              \n\t"
        "movq %%mm4, %%mm0              \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "movq %%mm1, %%mm6              \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd %%mm5, (%0)               \n\t"
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd %%mm5, (%0,%2)            \n\t"
        "movd %%mm3, (%0,%2,2)          \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd %%mm3, (%0,%3)            \n\t"
        "movd %%mm1, (%1)               \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%1,%2)            \n\t"
        "movd %%mm6, (%1,%2,2)          \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd %%mm6, (%1,%3)            \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride ),
           "r" ((long)(3*stride))
    );
    }
}

#ifdef CONFIG_ENCODERS
783
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
784
    int tmp;
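    /* sum of squares of all 256 pixels of a 16x16 block: each row is widened
       to 16-bit words and pmaddwd accumulates the squares pairwise into
       dword sums */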
785
  asm volatile (
786
      "movl $16,%%ecx\n"
787
      "pxor %%mm0,%%mm0\n"
788
      "pxor %%mm7,%%mm7\n"
789
      "1:\n"
790
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
791
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */
792

    
793
      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */
794

    
795
      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
796
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
797

    
798
      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
799
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
800
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
801

    
802
      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
803
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
804

    
805
      "pmaddwd %%mm3,%%mm3\n"
806
      "pmaddwd %%mm4,%%mm4\n"
807

    
808
      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
809
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
810
      "paddd %%mm3,%%mm4\n"
811
      "paddd %%mm2,%%mm7\n"
812

    
813
      "add %2, %0\n"
814
      "paddd %%mm4,%%mm7\n"
815
      "dec %%ecx\n"
816
      "jnz 1b\n"
817

    
818
      "movq %%mm7,%%mm1\n"
819
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
820
      "paddd %%mm7,%%mm1\n"
821
      "movd %%mm1,%1\n"
822
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
823
    return tmp;
824
}
825

    
826
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
827
    int tmp;
828
  asm volatile (
829
      "movl %4,%%ecx\n"
830
      "shr $1,%%ecx\n"
831
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
832
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
833
      "1:\n"
834
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
835
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
836
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
837
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */
838

    
839
      /* todo: mm1-mm2, mm3-mm4 */
840
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
841
      /*       OR the results to get absolute difference */
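      /* scalar form of the trick, for reference:
         |a - b| == (a -sat b) | (b -sat a) for unsigned bytes
         (psubusb/por); the result is then widened and squared with pmaddwd */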
842
      "movq %%mm1,%%mm5\n"
843
      "movq %%mm3,%%mm6\n"
844
      "psubusb %%mm2,%%mm1\n"
845
      "psubusb %%mm4,%%mm3\n"
846
      "psubusb %%mm5,%%mm2\n"
847
      "psubusb %%mm6,%%mm4\n"
848

    
849
      "por %%mm1,%%mm2\n"
850
      "por %%mm3,%%mm4\n"
851

    
852
      /* now convert to 16-bit vectors so we can square them */
853
      "movq %%mm2,%%mm1\n"
854
      "movq %%mm4,%%mm3\n"
855

    
856
      "punpckhbw %%mm0,%%mm2\n"
857
      "punpckhbw %%mm0,%%mm4\n"
858
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
859
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
860

    
861
      "pmaddwd %%mm2,%%mm2\n"
862
      "pmaddwd %%mm4,%%mm4\n"
863
      "pmaddwd %%mm1,%%mm1\n"
864
      "pmaddwd %%mm3,%%mm3\n"
865

    
866
      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
867
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */
868

    
869
      "paddd %%mm2,%%mm1\n"
870
      "paddd %%mm4,%%mm3\n"
871
      "paddd %%mm1,%%mm7\n"
872
      "paddd %%mm3,%%mm7\n"
873

    
874
      "decl %%ecx\n"
875
      "jnz 1b\n"
876

    
877
      "movq %%mm7,%%mm1\n"
878
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
879
      "paddd %%mm7,%%mm1\n"
880
      "movd %%mm1,%2\n"
881
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
882
      : "r" ((long)line_size) , "m" (h)
883
      : "%ecx");
884
    return tmp;
885
}
886

    
887
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
888
    int tmp;
889
  asm volatile (
890
      "movl %4,%%ecx\n"
891
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
892
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
893
      "1:\n"
894
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
895
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
896
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
897
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */
898

    
899
      /* todo: mm1-mm2, mm3-mm4 */
900
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
901
      /*       OR the results to get absolute difference */
902
      "movq %%mm1,%%mm5\n"
903
      "movq %%mm3,%%mm6\n"
904
      "psubusb %%mm2,%%mm1\n"
905
      "psubusb %%mm4,%%mm3\n"
906
      "psubusb %%mm5,%%mm2\n"
907
      "psubusb %%mm6,%%mm4\n"
908

    
909
      "por %%mm1,%%mm2\n"
910
      "por %%mm3,%%mm4\n"
911

    
912
      /* now convert to 16-bit vectors so we can square them */
913
      "movq %%mm2,%%mm1\n"
914
      "movq %%mm4,%%mm3\n"
915

    
916
      "punpckhbw %%mm0,%%mm2\n"
917
      "punpckhbw %%mm0,%%mm4\n"
918
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
919
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
920

    
921
      "pmaddwd %%mm2,%%mm2\n"
922
      "pmaddwd %%mm4,%%mm4\n"
923
      "pmaddwd %%mm1,%%mm1\n"
924
      "pmaddwd %%mm3,%%mm3\n"
925

    
926
      "add %3,%0\n"
927
      "add %3,%1\n"
928

    
929
      "paddd %%mm2,%%mm1\n"
930
      "paddd %%mm4,%%mm3\n"
931
      "paddd %%mm1,%%mm7\n"
932
      "paddd %%mm3,%%mm7\n"
933

    
934
      "decl %%ecx\n"
935
      "jnz 1b\n"
936

    
937
      "movq %%mm7,%%mm1\n"
938
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
939
      "paddd %%mm7,%%mm1\n"
940
      "movd %%mm1,%2\n"
941
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
942
      : "r" ((long)line_size) , "m" (h)
943
      : "%ecx");
944
    return tmp;
945
}
946

    
947
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
948
    int tmp;
949
  asm volatile (
950
      "shr $1,%2\n"
951
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
952
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
953
      "1:\n"
954
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
955
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
956
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
957
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
958

    
959
      /* todo: mm1-mm2, mm3-mm4 */
960
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
961
      /*       OR the results to get absolute difference */
962
      "movdqa %%xmm1,%%xmm5\n"
963
      "movdqa %%xmm3,%%xmm6\n"
964
      "psubusb %%xmm2,%%xmm1\n"
965
      "psubusb %%xmm4,%%xmm3\n"
966
      "psubusb %%xmm5,%%xmm2\n"
967
      "psubusb %%xmm6,%%xmm4\n"
968

    
969
      "por %%xmm1,%%xmm2\n"
970
      "por %%xmm3,%%xmm4\n"
971

    
972
      /* now convert to 16-bit vectors so we can square them */
973
      "movdqa %%xmm2,%%xmm1\n"
974
      "movdqa %%xmm4,%%xmm3\n"
975

    
976
      "punpckhbw %%xmm0,%%xmm2\n"
977
      "punpckhbw %%xmm0,%%xmm4\n"
978
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
979
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */
980

    
981
      "pmaddwd %%xmm2,%%xmm2\n"
982
      "pmaddwd %%xmm4,%%xmm4\n"
983
      "pmaddwd %%xmm1,%%xmm1\n"
984
      "pmaddwd %%xmm3,%%xmm3\n"
985

    
986
      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
987
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */
988

    
989
      "paddd %%xmm2,%%xmm1\n"
990
      "paddd %%xmm4,%%xmm3\n"
991
      "paddd %%xmm1,%%xmm7\n"
992
      "paddd %%xmm3,%%xmm7\n"
993

    
994
      "decl %2\n"
995
      "jnz 1b\n"
996

    
997
      "movdqa %%xmm7,%%xmm1\n"
998
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
999
      "paddd %%xmm1,%%xmm7\n"
1000
      "movdqa %%xmm7,%%xmm1\n"
1001
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
1002
      "paddd %%xmm1,%%xmm7\n"
1003
      "movd %%xmm7,%3\n"
1004
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
1005
      : "r" ((long)line_size));
1006
    return tmp;
1007
}
1008

    
1009
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
1010
    int tmp;
1011
  asm volatile (
1012
      "movl %3,%%ecx\n"
1013
      "pxor %%mm7,%%mm7\n"
1014
      "pxor %%mm6,%%mm6\n"
1015

    
1016
      "movq (%0),%%mm0\n"
1017
      "movq %%mm0, %%mm1\n"
1018
      "psllq $8, %%mm0\n"
1019
      "psrlq $8, %%mm1\n"
1020
      "psrlq $8, %%mm0\n"
1021
      "movq %%mm0, %%mm2\n"
1022
      "movq %%mm1, %%mm3\n"
1023
      "punpcklbw %%mm7,%%mm0\n"
1024
      "punpcklbw %%mm7,%%mm1\n"
1025
      "punpckhbw %%mm7,%%mm2\n"
1026
      "punpckhbw %%mm7,%%mm3\n"
1027
      "psubw %%mm1, %%mm0\n"
1028
      "psubw %%mm3, %%mm2\n"
1029

    
1030
      "add %2,%0\n"
1031

    
1032
      "movq (%0),%%mm4\n"
1033
      "movq %%mm4, %%mm1\n"
1034
      "psllq $8, %%mm4\n"
1035
      "psrlq $8, %%mm1\n"
1036
      "psrlq $8, %%mm4\n"
1037
      "movq %%mm4, %%mm5\n"
1038
      "movq %%mm1, %%mm3\n"
1039
      "punpcklbw %%mm7,%%mm4\n"
1040
      "punpcklbw %%mm7,%%mm1\n"
1041
      "punpckhbw %%mm7,%%mm5\n"
1042
      "punpckhbw %%mm7,%%mm3\n"
1043
      "psubw %%mm1, %%mm4\n"
1044
      "psubw %%mm3, %%mm5\n"
1045
      "psubw %%mm4, %%mm0\n"
1046
      "psubw %%mm5, %%mm2\n"
1047
      "pxor %%mm3, %%mm3\n"
1048
      "pxor %%mm1, %%mm1\n"
1049
      "pcmpgtw %%mm0, %%mm3\n\t"
1050
      "pcmpgtw %%mm2, %%mm1\n\t"
1051
      "pxor %%mm3, %%mm0\n"
1052
      "pxor %%mm1, %%mm2\n"
1053
      "psubw %%mm3, %%mm0\n"
1054
      "psubw %%mm1, %%mm2\n"
1055
      "paddw %%mm0, %%mm2\n"
1056
      "paddw %%mm2, %%mm6\n"
1057

    
1058
      "add %2,%0\n"
1059
      "1:\n"
1060

    
1061
      "movq (%0),%%mm0\n"
1062
      "movq %%mm0, %%mm1\n"
1063
      "psllq $8, %%mm0\n"
1064
      "psrlq $8, %%mm1\n"
1065
      "psrlq $8, %%mm0\n"
1066
      "movq %%mm0, %%mm2\n"
1067
      "movq %%mm1, %%mm3\n"
1068
      "punpcklbw %%mm7,%%mm0\n"
1069
      "punpcklbw %%mm7,%%mm1\n"
1070
      "punpckhbw %%mm7,%%mm2\n"
1071
      "punpckhbw %%mm7,%%mm3\n"
1072
      "psubw %%mm1, %%mm0\n"
1073
      "psubw %%mm3, %%mm2\n"
1074
      "psubw %%mm0, %%mm4\n"
1075
      "psubw %%mm2, %%mm5\n"
1076
      "pxor %%mm3, %%mm3\n"
1077
      "pxor %%mm1, %%mm1\n"
1078
      "pcmpgtw %%mm4, %%mm3\n\t"
1079
      "pcmpgtw %%mm5, %%mm1\n\t"
1080
      "pxor %%mm3, %%mm4\n"
1081
      "pxor %%mm1, %%mm5\n"
1082
      "psubw %%mm3, %%mm4\n"
1083
      "psubw %%mm1, %%mm5\n"
1084
      "paddw %%mm4, %%mm5\n"
1085
      "paddw %%mm5, %%mm6\n"
1086

    
1087
      "add %2,%0\n"
1088

    
1089
      "movq (%0),%%mm4\n"
1090
      "movq %%mm4, %%mm1\n"
1091
      "psllq $8, %%mm4\n"
1092
      "psrlq $8, %%mm1\n"
1093
      "psrlq $8, %%mm4\n"
1094
      "movq %%mm4, %%mm5\n"
1095
      "movq %%mm1, %%mm3\n"
1096
      "punpcklbw %%mm7,%%mm4\n"
1097
      "punpcklbw %%mm7,%%mm1\n"
1098
      "punpckhbw %%mm7,%%mm5\n"
1099
      "punpckhbw %%mm7,%%mm3\n"
1100
      "psubw %%mm1, %%mm4\n"
1101
      "psubw %%mm3, %%mm5\n"
1102
      "psubw %%mm4, %%mm0\n"
1103
      "psubw %%mm5, %%mm2\n"
1104
      "pxor %%mm3, %%mm3\n"
1105
      "pxor %%mm1, %%mm1\n"
1106
      "pcmpgtw %%mm0, %%mm3\n\t"
1107
      "pcmpgtw %%mm2, %%mm1\n\t"
1108
      "pxor %%mm3, %%mm0\n"
1109
      "pxor %%mm1, %%mm2\n"
1110
      "psubw %%mm3, %%mm0\n"
1111
      "psubw %%mm1, %%mm2\n"
1112
      "paddw %%mm0, %%mm2\n"
1113
      "paddw %%mm2, %%mm6\n"
1114

    
1115
      "add %2,%0\n"
1116
      "subl $2, %%ecx\n"
1117
      " jnz 1b\n"
1118

    
1119
      "movq %%mm6, %%mm0\n"
1120
      "punpcklwd %%mm7,%%mm0\n"
1121
      "punpckhwd %%mm7,%%mm6\n"
1122
      "paddd %%mm0, %%mm6\n"
1123

    
1124
      "movq %%mm6,%%mm0\n"
1125
      "psrlq $32, %%mm6\n"
1126
      "paddd %%mm6,%%mm0\n"
1127
      "movd %%mm0,%1\n"
1128
      : "+r" (pix1), "=r"(tmp)
1129
      : "r" ((long)line_size) , "g" (h-2)
1130
      : "%ecx");
1131
      return tmp;
1132
}
1133

    
1134
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1135
    int tmp;
1136
    uint8_t * pix= pix1;
1137
  asm volatile (
1138
      "movl %3,%%ecx\n"
1139
      "pxor %%mm7,%%mm7\n"
1140
      "pxor %%mm6,%%mm6\n"
1141

    
1142
      "movq (%0),%%mm0\n"
1143
      "movq 1(%0),%%mm1\n"
1144
      "movq %%mm0, %%mm2\n"
1145
      "movq %%mm1, %%mm3\n"
1146
      "punpcklbw %%mm7,%%mm0\n"
1147
      "punpcklbw %%mm7,%%mm1\n"
1148
      "punpckhbw %%mm7,%%mm2\n"
1149
      "punpckhbw %%mm7,%%mm3\n"
1150
      "psubw %%mm1, %%mm0\n"
1151
      "psubw %%mm3, %%mm2\n"
1152

    
1153
      "add %2,%0\n"
1154

    
1155
      "movq (%0),%%mm4\n"
1156
      "movq 1(%0),%%mm1\n"
1157
      "movq %%mm4, %%mm5\n"
1158
      "movq %%mm1, %%mm3\n"
1159
      "punpcklbw %%mm7,%%mm4\n"
1160
      "punpcklbw %%mm7,%%mm1\n"
1161
      "punpckhbw %%mm7,%%mm5\n"
1162
      "punpckhbw %%mm7,%%mm3\n"
1163
      "psubw %%mm1, %%mm4\n"
1164
      "psubw %%mm3, %%mm5\n"
1165
      "psubw %%mm4, %%mm0\n"
1166
      "psubw %%mm5, %%mm2\n"
1167
      "pxor %%mm3, %%mm3\n"
1168
      "pxor %%mm1, %%mm1\n"
1169
      "pcmpgtw %%mm0, %%mm3\n\t"
1170
      "pcmpgtw %%mm2, %%mm1\n\t"
1171
      "pxor %%mm3, %%mm0\n"
1172
      "pxor %%mm1, %%mm2\n"
1173
      "psubw %%mm3, %%mm0\n"
1174
      "psubw %%mm1, %%mm2\n"
1175
      "paddw %%mm0, %%mm2\n"
1176
      "paddw %%mm2, %%mm6\n"
1177

    
1178
      "add %2,%0\n"
1179
      "1:\n"
1180

    
1181
      "movq (%0),%%mm0\n"
1182
      "movq 1(%0),%%mm1\n"
1183
      "movq %%mm0, %%mm2\n"
1184
      "movq %%mm1, %%mm3\n"
1185
      "punpcklbw %%mm7,%%mm0\n"
1186
      "punpcklbw %%mm7,%%mm1\n"
1187
      "punpckhbw %%mm7,%%mm2\n"
1188
      "punpckhbw %%mm7,%%mm3\n"
1189
      "psubw %%mm1, %%mm0\n"
1190
      "psubw %%mm3, %%mm2\n"
1191
      "psubw %%mm0, %%mm4\n"
1192
      "psubw %%mm2, %%mm5\n"
1193
      "pxor %%mm3, %%mm3\n"
1194
      "pxor %%mm1, %%mm1\n"
1195
      "pcmpgtw %%mm4, %%mm3\n\t"
1196
      "pcmpgtw %%mm5, %%mm1\n\t"
1197
      "pxor %%mm3, %%mm4\n"
1198
      "pxor %%mm1, %%mm5\n"
1199
      "psubw %%mm3, %%mm4\n"
1200
      "psubw %%mm1, %%mm5\n"
1201
      "paddw %%mm4, %%mm5\n"
1202
      "paddw %%mm5, %%mm6\n"
1203

    
1204
      "add %2,%0\n"
1205

    
1206
      "movq (%0),%%mm4\n"
1207
      "movq 1(%0),%%mm1\n"
1208
      "movq %%mm4, %%mm5\n"
1209
      "movq %%mm1, %%mm3\n"
1210
      "punpcklbw %%mm7,%%mm4\n"
1211
      "punpcklbw %%mm7,%%mm1\n"
1212
      "punpckhbw %%mm7,%%mm5\n"
1213
      "punpckhbw %%mm7,%%mm3\n"
1214
      "psubw %%mm1, %%mm4\n"
1215
      "psubw %%mm3, %%mm5\n"
1216
      "psubw %%mm4, %%mm0\n"
1217
      "psubw %%mm5, %%mm2\n"
1218
      "pxor %%mm3, %%mm3\n"
1219
      "pxor %%mm1, %%mm1\n"
1220
      "pcmpgtw %%mm0, %%mm3\n\t"
1221
      "pcmpgtw %%mm2, %%mm1\n\t"
1222
      "pxor %%mm3, %%mm0\n"
1223
      "pxor %%mm1, %%mm2\n"
1224
      "psubw %%mm3, %%mm0\n"
1225
      "psubw %%mm1, %%mm2\n"
1226
      "paddw %%mm0, %%mm2\n"
1227
      "paddw %%mm2, %%mm6\n"
1228

    
1229
      "add %2,%0\n"
1230
      "subl $2, %%ecx\n"
1231
      " jnz 1b\n"
1232

    
1233
      "movq %%mm6, %%mm0\n"
1234
      "punpcklwd %%mm7,%%mm0\n"
1235
      "punpckhwd %%mm7,%%mm6\n"
1236
      "paddd %%mm0, %%mm6\n"
1237

    
1238
      "movq %%mm6,%%mm0\n"
1239
      "psrlq $32, %%mm6\n"
1240
      "paddd %%mm6,%%mm0\n"
1241
      "movd %%mm0,%1\n"
1242
      : "+r" (pix1), "=r"(tmp)
1243
      : "r" ((long)line_size) , "g" (h-2)
1244
      : "%ecx");
1245
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
1246
}
1247

    
1248
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1249
    MpegEncContext *c = p;
1250
    int score1, score2;
1251

    
1252
    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1253
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1254
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
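    /* NSSE: SSE plus a weighted penalty on the difference in high-frequency
       "noise" between the two blocks (hf_noise* roughly sums the absolute
       differences of neighbouring horizontal gradients); the weight is
       avctx->nsse_weight, or 8 when no context is available */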
1255

    
1256
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1257
    else  return score1 + FFABS(score2)*8;
1258
}
1259

    
1260
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1261
    MpegEncContext *c = p;
1262
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1263
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1264

    
1265
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1266
    else  return score1 + FFABS(score2)*8;
1267
}
1268

    
1269
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1270
    int tmp;
1271

    
1272
    assert( (((int)pix) & 7) == 0);
1273
    assert((line_size &7) ==0);
1274

    
1275
#define SUM(in0, in1, out0, out1) \
1276
      "movq (%0), %%mm2\n"\
1277
      "movq 8(%0), %%mm3\n"\
1278
      "add %2,%0\n"\
1279
      "movq %%mm2, " #out0 "\n"\
1280
      "movq %%mm3, " #out1 "\n"\
1281
      "psubusb " #in0 ", %%mm2\n"\
1282
      "psubusb " #in1 ", %%mm3\n"\
1283
      "psubusb " #out0 ", " #in0 "\n"\
1284
      "psubusb " #out1 ", " #in1 "\n"\
1285
      "por %%mm2, " #in0 "\n"\
1286
      "por %%mm3, " #in1 "\n"\
1287
      "movq " #in0 ", %%mm2\n"\
1288
      "movq " #in1 ", %%mm3\n"\
1289
      "punpcklbw %%mm7, " #in0 "\n"\
1290
      "punpcklbw %%mm7, " #in1 "\n"\
1291
      "punpckhbw %%mm7, %%mm2\n"\
1292
      "punpckhbw %%mm7, %%mm3\n"\
1293
      "paddw " #in1 ", " #in0 "\n"\
1294
      "paddw %%mm3, %%mm2\n"\
1295
      "paddw %%mm2, " #in0 "\n"\
1296
      "paddw " #in0 ", %%mm6\n"
1297

    
1298

    
1299
  asm volatile (
1300
      "movl %3,%%ecx\n"
1301
      "pxor %%mm6,%%mm6\n"
1302
      "pxor %%mm7,%%mm7\n"
1303
      "movq (%0),%%mm0\n"
1304
      "movq 8(%0),%%mm1\n"
1305
      "add %2,%0\n"
1306
      "subl $2, %%ecx\n"
1307
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1308
      "1:\n"
1309

    
1310
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1311

    
1312
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1313

    
1314
      "subl $2, %%ecx\n"
1315
      "jnz 1b\n"
1316

    
1317
      "movq %%mm6,%%mm0\n"
1318
      "psrlq $32, %%mm6\n"
1319
      "paddw %%mm6,%%mm0\n"
1320
      "movq %%mm0,%%mm6\n"
1321
      "psrlq $16, %%mm0\n"
1322
      "paddw %%mm6,%%mm0\n"
1323
      "movd %%mm0,%1\n"
1324
      : "+r" (pix), "=r"(tmp)
1325
      : "r" ((long)line_size) , "m" (h)
1326
      : "%ecx");
1327
    return tmp & 0xFFFF;
1328
}
1329
#undef SUM
1330

    
1331
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1332
    int tmp;
1333

    
1334
    assert( (((int)pix) & 7) == 0);
1335
    assert((line_size &7) ==0);
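    /* same scheme as the MMX version above, but psadbw does the byte-wise
       absolute difference and its horizontal sum in one instruction */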
1336

    
1337
#define SUM(in0, in1, out0, out1) \
1338
      "movq (%0), " #out0 "\n"\
1339
      "movq 8(%0), " #out1 "\n"\
1340
      "add %2,%0\n"\
1341
      "psadbw " #out0 ", " #in0 "\n"\
1342
      "psadbw " #out1 ", " #in1 "\n"\
1343
      "paddw " #in1 ", " #in0 "\n"\
1344
      "paddw " #in0 ", %%mm6\n"
1345

    
1346
  asm volatile (
1347
      "movl %3,%%ecx\n"
1348
      "pxor %%mm6,%%mm6\n"
1349
      "pxor %%mm7,%%mm7\n"
1350
      "movq (%0),%%mm0\n"
1351
      "movq 8(%0),%%mm1\n"
1352
      "add %2,%0\n"
1353
      "subl $2, %%ecx\n"
1354
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1355
      "1:\n"
1356

    
1357
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1358

    
1359
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1360

    
1361
      "subl $2, %%ecx\n"
1362
      "jnz 1b\n"
1363

    
1364
      "movd %%mm6,%1\n"
1365
      : "+r" (pix), "=r"(tmp)
1366
      : "r" ((long)line_size) , "m" (h)
1367
      : "%ecx");
1368
    return tmp;
1369
}
1370
#undef SUM
1371

    
1372
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1373
    int tmp;
1374

    
1375
    assert( (((int)pix1) & 7) == 0);
1376
    assert( (((int)pix2) & 7) == 0);
1377
    assert((line_size &7) ==0);
1378

    
1379
#define SUM(in0, in1, out0, out1) \
1380
      "movq (%0),%%mm2\n"\
1381
      "movq (%1)," #out0 "\n"\
1382
      "movq 8(%0),%%mm3\n"\
1383
      "movq 8(%1)," #out1 "\n"\
1384
      "add %3,%0\n"\
1385
      "add %3,%1\n"\
1386
      "psubb " #out0 ", %%mm2\n"\
1387
      "psubb " #out1 ", %%mm3\n"\
1388
      "pxor %%mm7, %%mm2\n"\
1389
      "pxor %%mm7, %%mm3\n"\
1390
      "movq %%mm2, " #out0 "\n"\
1391
      "movq %%mm3, " #out1 "\n"\
1392
      "psubusb " #in0 ", %%mm2\n"\
1393
      "psubusb " #in1 ", %%mm3\n"\
1394
      "psubusb " #out0 ", " #in0 "\n"\
1395
      "psubusb " #out1 ", " #in1 "\n"\
1396
      "por %%mm2, " #in0 "\n"\
1397
      "por %%mm3, " #in1 "\n"\
1398
      "movq " #in0 ", %%mm2\n"\
1399
      "movq " #in1 ", %%mm3\n"\
1400
      "punpcklbw %%mm7, " #in0 "\n"\
1401
      "punpcklbw %%mm7, " #in1 "\n"\
1402
      "punpckhbw %%mm7, %%mm2\n"\
1403
      "punpckhbw %%mm7, %%mm3\n"\
1404
      "paddw " #in1 ", " #in0 "\n"\
1405
      "paddw %%mm3, %%mm2\n"\
1406
      "paddw %%mm2, " #in0 "\n"\
1407
      "paddw " #in0 ", %%mm6\n"
1408

    
1409

    
1410
  asm volatile (
1411
      "movl %4,%%ecx\n"
1412
      "pxor %%mm6,%%mm6\n"
1413
      "pcmpeqw %%mm7,%%mm7\n"
1414
      "psllw $15, %%mm7\n"
1415
      "packsswb %%mm7, %%mm7\n"
1416
      "movq (%0),%%mm0\n"
1417
      "movq (%1),%%mm2\n"
1418
      "movq 8(%0),%%mm1\n"
1419
      "movq 8(%1),%%mm3\n"
1420
      "add %3,%0\n"
1421
      "add %3,%1\n"
1422
      "subl $2, %%ecx\n"
1423
      "psubb %%mm2, %%mm0\n"
1424
      "psubb %%mm3, %%mm1\n"
1425
      "pxor %%mm7, %%mm0\n"
1426
      "pxor %%mm7, %%mm1\n"
1427
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1428
      "1:\n"
1429

    
1430
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1431

    
1432
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1433

    
1434
      "subl $2, %%ecx\n"
1435
      "jnz 1b\n"
1436

    
1437
      "movq %%mm6,%%mm0\n"
1438
      "psrlq $32, %%mm6\n"
1439
      "paddw %%mm6,%%mm0\n"
1440
      "movq %%mm0,%%mm6\n"
1441
      "psrlq $16, %%mm0\n"
1442
      "paddw %%mm6,%%mm0\n"
1443
      "movd %%mm0,%2\n"
1444
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1445
      : "r" ((long)line_size) , "m" (h)
1446
      : "%ecx");
1447
    return tmp & 0x7FFF;
1448
}
1449
#undef SUM
1450

    
1451
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1452
    int tmp;
1453

    
1454
    assert( (((int)pix1) & 7) == 0);
1455
    assert( (((int)pix2) & 7) == 0);
1456
    assert((line_size &7) ==0);
1457

    
1458
#define SUM(in0, in1, out0, out1) \
1459
      "movq (%0)," #out0 "\n"\
1460
      "movq (%1),%%mm2\n"\
1461
      "movq 8(%0)," #out1 "\n"\
1462
      "movq 8(%1),%%mm3\n"\
1463
      "add %3,%0\n"\
1464
      "add %3,%1\n"\
1465
      "psubb %%mm2, " #out0 "\n"\
1466
      "psubb %%mm3, " #out1 "\n"\
1467
      "pxor %%mm7, " #out0 "\n"\
1468
      "pxor %%mm7, " #out1 "\n"\
1469
      "psadbw " #out0 ", " #in0 "\n"\
1470
      "psadbw " #out1 ", " #in1 "\n"\
1471
      "paddw " #in1 ", " #in0 "\n"\
1472
      "paddw " #in0 ", %%mm6\n"
1473

    
1474
  asm volatile (
1475
      "movl %4,%%ecx\n"
1476
      "pxor %%mm6,%%mm6\n"
1477
      "pcmpeqw %%mm7,%%mm7\n"
1478
      "psllw $15, %%mm7\n"
1479
      "packsswb %%mm7, %%mm7\n"
1480
      "movq (%0),%%mm0\n"
1481
      "movq (%1),%%mm2\n"
1482
      "movq 8(%0),%%mm1\n"
1483
      "movq 8(%1),%%mm3\n"
1484
      "add %3,%0\n"
1485
      "add %3,%1\n"
1486
      "subl $2, %%ecx\n"
1487
      "psubb %%mm2, %%mm0\n"
1488
      "psubb %%mm3, %%mm1\n"
1489
      "pxor %%mm7, %%mm0\n"
1490
      "pxor %%mm7, %%mm1\n"
1491
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1492
      "1:\n"
1493

    
1494
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1495

    
1496
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1497

    
1498
      "subl $2, %%ecx\n"
1499
      "jnz 1b\n"
1500

    
1501
      "movd %%mm6,%2\n"
1502
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1503
      : "r" ((long)line_size) , "m" (h)
1504
      : "%ecx");
1505
    return tmp;
1506
}
1507
#undef SUM
1508

    
1509
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1510
    long i=0;
1511
    asm volatile(
1512
        "1:                             \n\t"
1513
        "movq  (%2, %0), %%mm0          \n\t"
1514
        "movq  (%1, %0), %%mm1          \n\t"
1515
        "psubb %%mm0, %%mm1             \n\t"
1516
        "movq %%mm1, (%3, %0)           \n\t"
1517
        "movq 8(%2, %0), %%mm0          \n\t"
1518
        "movq 8(%1, %0), %%mm1          \n\t"
1519
        "psubb %%mm0, %%mm1             \n\t"
1520
        "movq %%mm1, 8(%3, %0)          \n\t"
1521
        "add $16, %0                    \n\t"
1522
        "cmp %4, %0                     \n\t"
1523
        " jb 1b                         \n\t"
1524
        : "+r" (i)
1525
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1526
    );
1527
    for(; i<w; i++)
1528
        dst[i+0] = src1[i+0]-src2[i+0];
1529
}
1530

    
1531
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1532
    long i=0;
1533
    uint8_t l, lt;
1534

    
1535
    asm volatile(
1536
        "1:                             \n\t"
1537
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
1538
        "movq  (%1, %0), %%mm1          \n\t" // T
1539
        "movq  -1(%2, %0), %%mm2        \n\t" // L
1540
        "movq  (%2, %0), %%mm3          \n\t" // X
1541
        "movq %%mm2, %%mm4              \n\t" // L
1542
        "psubb %%mm0, %%mm2             \n\t"
1543
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
1544
        "movq %%mm4, %%mm5              \n\t" // L
1545
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
1546
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
1547
        "pminub %%mm2, %%mm4            \n\t"
1548
        "pmaxub %%mm1, %%mm4            \n\t"
1549
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
1550
        "movq %%mm3, (%3, %0)           \n\t"
1551
        "add $8, %0                     \n\t"
1552
        "cmp %4, %0                     \n\t"
1553
        " jb 1b                         \n\t"
1554
        : "+r" (i)
1555
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1556
    );
1557

    
1558
    l= *left;
1559
    lt= *left_top;
1560

    
1561
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1562

    
1563
    *left_top= src1[w-1];
1564
    *left    = src2[w-1];
1565
}
1566

    
1567
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
1568
    "mov"#m" "#p1", "#a"              \n\t"\
1569
    "mov"#m" "#p2", "#t"              \n\t"\
1570
    "punpcklbw "#a", "#t"             \n\t"\
1571
    "punpcklbw "#a", "#a"             \n\t"\
1572
    "psubw     "#t", "#a"             \n\t"\
1573

    
1574
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
1575
    uint8_t *p1b=p1, *p2b=p2;\
1576
    asm volatile(\
1577
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
1578
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
1579
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
1580
        "add %4, %1                   \n\t"\
1581
        "add %4, %2                   \n\t"\
1582
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
1583
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
1584
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
1585
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
1586
        "mov"#m1" "#mm"0, %0          \n\t"\
1587
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
1588
        "mov"#m1" %0, "#mm"0          \n\t"\
1589
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
1590
        : "r"((long)stride), "r"((long)stride*3)\
1591
    );\
1592
}
1593
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
1594

    
1595
#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
1596
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
1597

    
1598
#define LBUTTERFLY2(a1,b1,a2,b2)\
1599
    "paddw " #b1 ", " #a1 "           \n\t"\
1600
    "paddw " #b2 ", " #a2 "           \n\t"\
1601
    "paddw " #b1 ", " #b1 "           \n\t"\
1602
    "paddw " #b2 ", " #b2 "           \n\t"\
1603
    "psubw " #a1 ", " #b1 "           \n\t"\
1604
    "psubw " #a2 ", " #b2 "           \n\t"
1605

    
1606
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1607
        LBUTTERFLY2(m0, m1, m2, m3)\
1608
        LBUTTERFLY2(m4, m5, m6, m7)\
1609
        LBUTTERFLY2(m0, m2, m1, m3)\
1610
        LBUTTERFLY2(m4, m6, m5, m7)\
1611
        LBUTTERFLY2(m0, m4, m1, m5)\
1612
        LBUTTERFLY2(m2, m6, m3, m7)\
1613

    
1614
#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
1615

    
1616
#define MMABS_MMX(a,z)\
1617
    "pxor " #z ", " #z "              \n\t"\
1618
    "pcmpgtw " #a ", " #z "           \n\t"\
1619
    "pxor " #z ", " #a "              \n\t"\
1620
    "psubw " #z ", " #a "             \n\t"
1621

    
1622
#define MMABS_MMX2(a,z)\
1623
    "pxor " #z ", " #z "              \n\t"\
1624
    "psubw " #a ", " #z "             \n\t"\
1625
    "pmaxsw " #z ", " #a "            \n\t"
1626

    
1627
#define MMABS_SSSE3(a,z)\
1628
    "pabsw " #a ", " #a "             \n\t"
1629

    
1630
#define MMABS_SUM(a,z, sum)\
1631
    MMABS(a,z)\
1632
    "paddusw " #a ", " #sum "         \n\t"
1633

    
1634
#define MMABS_SUM_8x8_NOSPILL\
1635
    MMABS(%%xmm0, %%xmm8)\
1636
    MMABS(%%xmm1, %%xmm9)\
1637
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1638
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1639
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1640
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1641
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1642
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1643
    "paddusw %%xmm1, %%xmm0           \n\t"
1644

    
1645
#ifdef ARCH_X86_64
1646
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1647
#else
1648
#define MMABS_SUM_8x8_SSE2\
1649
    "movdqa %%xmm7, (%1)              \n\t"\
1650
    MMABS(%%xmm0, %%xmm7)\
1651
    MMABS(%%xmm1, %%xmm7)\
1652
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1653
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1654
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1655
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1656
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1657
    "movdqa (%1), %%xmm2              \n\t"\
1658
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1659
    "paddusw %%xmm1, %%xmm0           \n\t"
1660
#endif
1661

    
1662
#define LOAD4(o, a, b, c, d)\
1663
    "movq "#o"(%1),    "#a"           \n\t"\
1664
    "movq "#o"+8(%1),  "#b"           \n\t"\
1665
    "movq "#o"+16(%1), "#c"           \n\t"\
1666
    "movq "#o"+24(%1), "#d"           \n\t"\
1667

    
1668
#define STORE4(o, a, b, c, d)\
1669
    "movq "#a", "#o"(%1)              \n\t"\
1670
    "movq "#b", "#o"+8(%1)            \n\t"\
1671
    "movq "#c", "#o"+16(%1)           \n\t"\
1672
    "movq "#d", "#o"+24(%1)           \n\t"\
1673

    
1674
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1675
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1676
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
1677
#define HSUM_MMX(a, t, dst)\
1678
    "movq "#a", "#t"                  \n\t"\
1679
    "psrlq $32, "#a"                  \n\t"\
1680
    "paddusw "#t", "#a"               \n\t"\
1681
    "movq "#a", "#t"                  \n\t"\
1682
    "psrlq $16, "#a"                  \n\t"\
1683
    "paddusw "#t", "#a"               \n\t"\
1684
    "movd "#a", "#dst"                \n\t"\
1685

    
1686
#define HSUM_MMX2(a, t, dst)\
1687
    "pshufw $0x0E, "#a", "#t"         \n\t"\
1688
    "paddusw "#t", "#a"               \n\t"\
1689
    "pshufw $0x01, "#a", "#t"         \n\t"\
1690
    "paddusw "#t", "#a"               \n\t"\
1691
    "movd "#a", "#dst"                \n\t"\
1692

    
1693
#define HSUM_SSE2(a, t, dst)\
1694
    "movhlps "#a", "#t"               \n\t"\
1695
    "paddusw "#t", "#a"               \n\t"\
1696
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
1697
    "paddusw "#t", "#a"               \n\t"\
1698
    "pshuflw $0x01, "#a", "#t"        \n\t"\
1699
    "paddusw "#t", "#a"               \n\t"\
1700
    "movd "#a", "#dst"                \n\t"\
1701

    
1702
#define HADAMARD8_DIFF_MMX(cpu) \
1703
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1704
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1705
    int sum;\
1706
\
1707
    assert(h==8);\
1708
\
1709
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1710
\
1711
    asm volatile(\
1712
        HADAMARD48\
1713
\
1714
        "movq %%mm7, 96(%1)             \n\t"\
1715
\
1716
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1717
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1718
\
1719
        "movq 96(%1), %%mm7             \n\t"\
1720
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1721
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1722
\
1723
        : "=r" (sum)\
1724
        : "r"(temp)\
1725
    );\
1726
\
1727
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1728
\
1729
    asm volatile(\
1730
        HADAMARD48\
1731
\
1732
        "movq %%mm7, 96(%1)             \n\t"\
1733
\
1734
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1735
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1736
\
1737
        "movq 96(%1), %%mm7             \n\t"\
1738
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1739
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
1740
        "movq %%mm6, %%mm7              \n\t"\
1741
        "movq %%mm0, %%mm6              \n\t"\
1742
\
1743
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1744
\
1745
        HADAMARD48\
1746
        "movq %%mm7, 64(%1)             \n\t"\
1747
        MMABS(%%mm0, %%mm7)\
1748
        MMABS(%%mm1, %%mm7)\
1749
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1750
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1751
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1752
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1753
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1754
        "movq 64(%1), %%mm2             \n\t"\
1755
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1756
        "paddusw %%mm1, %%mm0           \n\t"\
1757
        "movq %%mm0, 64(%1)             \n\t"\
1758
\
1759
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1760
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1761
\
1762
        HADAMARD48\
1763
        "movq %%mm7, (%1)               \n\t"\
1764
        MMABS(%%mm0, %%mm7)\
1765
        MMABS(%%mm1, %%mm7)\
1766
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1767
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1768
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1769
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1770
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1771
        "movq (%1), %%mm2               \n\t"\
1772
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1773
        "paddusw 64(%1), %%mm0          \n\t"\
1774
        "paddusw %%mm1, %%mm0           \n\t"\
1775
\
1776
        HSUM(%%mm0, %%mm1, %0)\
1777
\
1778
        : "=r" (sum)\
1779
        : "r"(temp)\
1780
    );\
1781
    return sum&0xFFFF;\
1782
}\
1783
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1784

    
1785
#define HADAMARD8_DIFF_SSE2(cpu) \
1786
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1787
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1788
    int sum;\
1789
\
1790
    assert(h==8);\
1791
\
1792
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1793
\
1794
    asm volatile(\
1795
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1796
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1797
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1798
        MMABS_SUM_8x8\
1799
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1800
        : "=r" (sum)\
1801
        : "r"(temp)\
1802
    );\
1803
    return sum&0xFFFF;\
1804
}\
1805
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1806

    
1807
#define MMABS(a,z)         MMABS_MMX(a,z)
1808
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
1809
HADAMARD8_DIFF_MMX(mmx)
1810
#undef MMABS
1811
#undef HSUM
1812

    
1813
#define MMABS(a,z)         MMABS_MMX2(a,z)
1814
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
1815
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
1816
HADAMARD8_DIFF_MMX(mmx2)
1817
HADAMARD8_DIFF_SSE2(sse2)
1818
#undef MMABS
1819
#undef MMABS_SUM_8x8
1820
#undef HSUM
1821

    
1822
#ifdef HAVE_SSSE3
1823
#define MMABS(a,z)         MMABS_SSSE3(a,z)
1824
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
1825
HADAMARD8_DIFF_SSE2(ssse3)
1826
#undef MMABS
1827
#undef MMABS_SUM_8x8
1828
#endif
1829

    
1830
#define DCT_SAD4(m,mm,o)\
1831
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
1832
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
1833
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
1834
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
1835
    MMABS_SUM(mm##2, mm##6, mm##0)\
1836
    MMABS_SUM(mm##3, mm##7, mm##1)\
1837
    MMABS_SUM(mm##4, mm##6, mm##0)\
1838
    MMABS_SUM(mm##5, mm##7, mm##1)\
1839

    
1840
#define DCT_SAD_MMX\
1841
    "pxor %%mm0, %%mm0                \n\t"\
1842
    "pxor %%mm1, %%mm1                \n\t"\
1843
    DCT_SAD4(q, %%mm, 0)\
1844
    DCT_SAD4(q, %%mm, 8)\
1845
    DCT_SAD4(q, %%mm, 64)\
1846
    DCT_SAD4(q, %%mm, 72)\
1847
    "paddusw %%mm1, %%mm0             \n\t"\
1848
    HSUM(%%mm0, %%mm1, %0)
1849

    
1850
#define DCT_SAD_SSE2\
1851
    "pxor %%xmm0, %%xmm0              \n\t"\
1852
    "pxor %%xmm1, %%xmm1              \n\t"\
1853
    DCT_SAD4(dqa, %%xmm, 0)\
1854
    DCT_SAD4(dqa, %%xmm, 64)\
1855
    "paddusw %%xmm1, %%xmm0           \n\t"\
1856
    HSUM(%%xmm0, %%xmm1, %0)
1857

    
1858
#define DCT_SAD_FUNC(cpu) \
1859
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1860
    int sum;\
1861
    asm volatile(\
1862
        DCT_SAD\
1863
        :"=r"(sum)\
1864
        :"r"(block)\
1865
    );\
1866
    return sum&0xFFFF;\
1867
}
1868

    
1869
#define DCT_SAD       DCT_SAD_MMX
1870
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1871
#define MMABS(a,z)    MMABS_MMX(a,z)
1872
DCT_SAD_FUNC(mmx)
1873
#undef MMABS
1874
#undef HSUM
1875

    
1876
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1877
#define MMABS(a,z)    MMABS_MMX2(a,z)
1878
DCT_SAD_FUNC(mmx2)
1879
#undef HSUM
1880
#undef DCT_SAD
1881

    
1882
#define DCT_SAD       DCT_SAD_SSE2
1883
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1884
DCT_SAD_FUNC(sse2)
1885
#undef MMABS
1886

    
1887
#ifdef HAVE_SSSE3
1888
#define MMABS(a,z)    MMABS_SSSE3(a,z)
1889
DCT_SAD_FUNC(ssse3)
1890
#undef MMABS
1891
#endif
1892
#undef HSUM
1893
#undef DCT_SAD
1894

    
1895
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
1896
    int sum;
1897
    long i=size;
1898
    asm volatile(
1899
        "pxor %%mm4, %%mm4 \n"
1900
        "1: \n"
1901
        "sub $8, %0 \n"
1902
        "movq (%2,%0), %%mm2 \n"
1903
        "movq (%3,%0,2), %%mm0 \n"
1904
        "movq 8(%3,%0,2), %%mm1 \n"
1905
        "punpckhbw %%mm2, %%mm3 \n"
1906
        "punpcklbw %%mm2, %%mm2 \n"
1907
        "psraw $8, %%mm3 \n"
1908
        "psraw $8, %%mm2 \n"
1909
        "psubw %%mm3, %%mm1 \n"
1910
        "psubw %%mm2, %%mm0 \n"
1911
        "pmaddwd %%mm1, %%mm1 \n"
1912
        "pmaddwd %%mm0, %%mm0 \n"
1913
        "paddd %%mm1, %%mm4 \n"
1914
        "paddd %%mm0, %%mm4 \n"
1915
        "jg 1b \n"
1916
        "movq %%mm4, %%mm3 \n"
1917
        "psrlq $32, %%mm3 \n"
1918
        "paddd %%mm3, %%mm4 \n"
1919
        "movd %%mm4, %1 \n"
1920
        :"+r"(i), "=r"(sum)
1921
        :"r"(pix1), "r"(pix2)
1922
    );
1923
    return sum;
1924
}
1925

    
1926
#endif //CONFIG_ENCODERS
1927

    
1928
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1929
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
1930
        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
1931
        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
1932
        "movq "#in7", " #m3 "             \n\t" /* d */\
1933
        "movq "#in0", %%mm5               \n\t" /* D */\
1934
        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
1935
        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
1936
        "movq "#in1", %%mm5               \n\t" /* C */\
1937
        "movq "#in2", %%mm6               \n\t" /* B */\
1938
        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
1939
        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
1940
        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
1941
        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
1942
        "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
1943
        "paddw " #rnd ", %%mm4            \n\t" /* x2 */\
1944
        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1945
        "psraw $5, %%mm5                  \n\t"\
1946
        "packuswb %%mm5, %%mm5            \n\t"\
1947
        OP(%%mm5, out, %%mm7, d)
1948

    
1949
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1950
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1951
    uint64_t temp;\
1952
\
1953
    asm volatile(\
1954
        "pxor %%mm7, %%mm7                \n\t"\
1955
        "1:                               \n\t"\
1956
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
1957
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
1958
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
1959
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
1960
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
1961
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
1962
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
1963
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
1964
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
1965
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
1966
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
1967
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
1968
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
1969
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
1970
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
1971
        "paddw %%mm3, %%mm5               \n\t" /* b */\
1972
        "paddw %%mm2, %%mm6               \n\t" /* c */\
1973
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
1974
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
1975
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
1976
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
1977
        "paddw %%mm4, %%mm0               \n\t" /* a */\
1978
        "paddw %%mm1, %%mm5               \n\t" /* d */\
1979
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1980
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
1981
        "paddw %6, %%mm6                  \n\t"\
1982
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
1983
        "psraw $5, %%mm0                  \n\t"\
1984
        "movq %%mm0, %5                   \n\t"\
1985
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1986
        \
1987
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
1988
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
1989
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
1990
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
1991
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
1992
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
1993
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
1994
        "paddw %%mm0, %%mm2               \n\t" /* b */\
1995
        "paddw %%mm5, %%mm3               \n\t" /* c */\
1996
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
1997
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
1998
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
1999
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
2000
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
2001
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
2002
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
2003
        "paddw %%mm2, %%mm1               \n\t" /* a */\
2004
        "paddw %%mm6, %%mm4               \n\t" /* d */\
2005
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2006
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
2007
        "paddw %6, %%mm1                  \n\t"\
2008
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
2009
        "psraw $5, %%mm3                  \n\t"\
2010
        "movq %5, %%mm1                   \n\t"\
2011
        "packuswb %%mm3, %%mm1            \n\t"\
2012
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
2013
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
2014
        \
2015
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
2016
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
2017
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
2018
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
2019
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
2020
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
2021
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
2022
        "paddw %%mm1, %%mm5               \n\t" /* b */\
2023
        "paddw %%mm4, %%mm0               \n\t" /* c */\
2024
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
2025
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
2026
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
2027
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
2028
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
2029
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
2030
        "paddw %%mm3, %%mm2               \n\t" /* d */\
2031
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
2032
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
2033
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
2034
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
2035
        "paddw %%mm2, %%mm6               \n\t" /* a */\
2036
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
2037
        "paddw %6, %%mm0                  \n\t"\
2038
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
2039
        "psraw $5, %%mm0                  \n\t"\
2040
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
2041
        \
2042
        "paddw %%mm5, %%mm3               \n\t" /* a */\
2043
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
2044
        "paddw %%mm4, %%mm6               \n\t" /* b */\
2045
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
2046
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
2047
        "paddw %%mm1, %%mm4               \n\t" /* c */\
2048
        "paddw %%mm2, %%mm5               \n\t" /* d */\
2049
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
2050
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
2051
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2052
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
2053
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
2054
        "paddw %6, %%mm4                  \n\t"\
2055
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
2056
        "psraw $5, %%mm4                  \n\t"\
2057
        "packuswb %%mm4, %%mm0            \n\t"\
2058
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2059
        \
2060
        "add %3, %0                       \n\t"\
2061
        "add %4, %1                       \n\t"\
2062
        "decl %2                          \n\t"\
2063
        " jnz 1b                          \n\t"\
2064
        : "+a"(src), "+c"(dst), "+g"(h)\
2065
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2066
        : "memory"\
2067
    );\
2068
}\
2069
\
2070
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2071
    int i;\
2072
    int16_t temp[16];\
2073
    /* quick HACK, XXX FIXME MUST be optimized */\
2074
    for(i=0; i<h; i++)\
2075
    {\
2076
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2077
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2078
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2079
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2080
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2081
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2082
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2083
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2084
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2085
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2086
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2087
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2088
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2089
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2090
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2091
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2092
        asm volatile(\
2093
            "movq (%0), %%mm0               \n\t"\
2094
            "movq 8(%0), %%mm1              \n\t"\
2095
            "paddw %2, %%mm0                \n\t"\
2096
            "paddw %2, %%mm1                \n\t"\
2097
            "psraw $5, %%mm0                \n\t"\
2098
            "psraw $5, %%mm1                \n\t"\
2099
            "packuswb %%mm1, %%mm0          \n\t"\
2100
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2101
            "movq 16(%0), %%mm0             \n\t"\
2102
            "movq 24(%0), %%mm1             \n\t"\
2103
            "paddw %2, %%mm0                \n\t"\
2104
            "paddw %2, %%mm1                \n\t"\
2105
            "psraw $5, %%mm0                \n\t"\
2106
            "psraw $5, %%mm1                \n\t"\
2107
            "packuswb %%mm1, %%mm0          \n\t"\
2108
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2109
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2110
            : "memory"\
2111
        );\
2112
        dst+=dstStride;\
2113
        src+=srcStride;\
2114
    }\
2115
}\
2116
\
2117
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2118
    uint64_t temp;\
2119
\
2120
    asm volatile(\
2121
        "pxor %%mm7, %%mm7                \n\t"\
2122
        "1:                               \n\t"\
2123
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
2124
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
2125
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
2126
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
2127
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
2128
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
2129
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
2130
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
2131
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
2132
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
2133
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
2134
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
2135
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
2136
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
2137
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
2138
        "paddw %%mm3, %%mm5               \n\t" /* b */\
2139
        "paddw %%mm2, %%mm6               \n\t" /* c */\
2140
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
2141
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
2142
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
2143
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
2144
        "paddw %%mm4, %%mm0               \n\t" /* a */\
2145
        "paddw %%mm1, %%mm5               \n\t" /* d */\
2146
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2147
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
2148
        "paddw %6, %%mm6                  \n\t"\
2149
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
2150
        "psraw $5, %%mm0                  \n\t"\
2151
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2152
        \
2153
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
2154
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
2155
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
2156
        "paddw %%mm5, %%mm1               \n\t" /* a */\
2157
        "paddw %%mm6, %%mm2               \n\t" /* b */\
2158
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
2159
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
2160
        "paddw %%mm6, %%mm3               \n\t" /* c */\
2161
        "paddw %%mm5, %%mm4               \n\t" /* d */\
2162
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
2163
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
2164
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2165
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
2166
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
2167
        "paddw %6, %%mm1                  \n\t"\
2168
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
2169
        "psraw $5, %%mm3                  \n\t"\
2170
        "packuswb %%mm3, %%mm0            \n\t"\
2171
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
2172
        \
2173
        "add %3, %0                       \n\t"\
2174
        "add %4, %1                       \n\t"\
2175
        "decl %2                          \n\t"\
2176
        " jnz 1b                          \n\t"\
2177
        : "+a"(src), "+c"(dst), "+g"(h)\
2178
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2179
        : "memory"\
2180
    );\
2181
}\
2182
\
2183
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2184
    int i;\
2185
    int16_t temp[8];\
2186
    /* quick HACK, XXX FIXME MUST be optimized */\
2187
    for(i=0; i<h; i++)\
2188
    {\
2189
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2190
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2191
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2192
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2193
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2194
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2195
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2196
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2197
        asm volatile(\
2198
            "movq (%0), %%mm0           \n\t"\
2199
            "movq 8(%0), %%mm1          \n\t"\
2200
            "paddw %2, %%mm0            \n\t"\
2201
            "paddw %2, %%mm1            \n\t"\
2202
            "psraw $5, %%mm0            \n\t"\
2203
            "psraw $5, %%mm1            \n\t"\
2204
            "packuswb %%mm1, %%mm0      \n\t"\
2205
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2206
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2207
            :"memory"\
2208
        );\
2209
        dst+=dstStride;\
2210
        src+=srcStride;\
2211
    }\
2212
}
2213

    
2214
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2215
\
2216
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2217
    uint64_t temp[17*4];\
2218
    uint64_t *temp_ptr= temp;\
2219
    int count= 17;\
2220
\
2221
    /*FIXME unroll */\
2222
    asm volatile(\
2223
        "pxor %%mm7, %%mm7              \n\t"\
2224
        "1:                             \n\t"\
2225
        "movq (%0), %%mm0               \n\t"\
2226
        "movq (%0), %%mm1               \n\t"\
2227
        "movq 8(%0), %%mm2              \n\t"\
2228
        "movq 8(%0), %%mm3              \n\t"\
2229
        "punpcklbw %%mm7, %%mm0         \n\t"\
2230
        "punpckhbw %%mm7, %%mm1         \n\t"\
2231
        "punpcklbw %%mm7, %%mm2         \n\t"\
2232
        "punpckhbw %%mm7, %%mm3         \n\t"\
2233
        "movq %%mm0, (%1)               \n\t"\
2234
        "movq %%mm1, 17*8(%1)           \n\t"\
2235
        "movq %%mm2, 2*17*8(%1)         \n\t"\
2236
        "movq %%mm3, 3*17*8(%1)         \n\t"\
2237
        "add $8, %1                     \n\t"\
2238
        "add %3, %0                     \n\t"\
2239
        "decl %2                        \n\t"\
2240
        " jnz 1b                        \n\t"\
2241
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2242
        : "r" ((long)srcStride)\
2243
        : "memory"\
2244
    );\
2245
    \
2246
    temp_ptr= temp;\
2247
    count=4;\
2248
    \
2249
/*FIXME reorder for speed */\
2250
    asm volatile(\
2251
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2252
        "1:                             \n\t"\
2253
        "movq (%0), %%mm0               \n\t"\
2254
        "movq 8(%0), %%mm1              \n\t"\
2255
        "movq 16(%0), %%mm2             \n\t"\
2256
        "movq 24(%0), %%mm3             \n\t"\
2257
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2258
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2259
        "add %4, %1                     \n\t"\
2260
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2261
        \
2262
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2263
        "add %4, %1                     \n\t"\
2264
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2265
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2266
        "add %4, %1                     \n\t"\
2267
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2268
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2269
        "add %4, %1                     \n\t"\
2270
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2271
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2272
        "add %4, %1                     \n\t"\
2273
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2274
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2275
        "add %4, %1                     \n\t"\
2276
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2277
        \
2278
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2279
        "add %4, %1                     \n\t"  \
2280
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2281
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2282
        \
2283
        "add $136, %0                   \n\t"\
2284
        "add %6, %1                     \n\t"\
2285
        "decl %2                        \n\t"\
2286
        " jnz 1b                        \n\t"\
2287
        \
2288
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2289
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2290
        :"memory"\
2291
    );\
2292
}\
2293
\
2294
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2295
    uint64_t temp[9*2];\
2296
    uint64_t *temp_ptr= temp;\
2297
    int count= 9;\
2298
\
2299
    /*FIXME unroll */\
2300
    asm volatile(\
2301
        "pxor %%mm7, %%mm7              \n\t"\
2302
        "1:                             \n\t"\
2303
        "movq (%0), %%mm0               \n\t"\
2304
        "movq (%0), %%mm1               \n\t"\
2305
        "punpcklbw %%mm7, %%mm0         \n\t"\
2306
        "punpckhbw %%mm7, %%mm1         \n\t"\
2307
        "movq %%mm0, (%1)               \n\t"\
2308
        "movq %%mm1, 9*8(%1)            \n\t"\
2309
        "add $8, %1                     \n\t"\
2310
        "add %3, %0                     \n\t"\
2311
        "decl %2                        \n\t"\
2312
        " jnz 1b                        \n\t"\
2313
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2314
        : "r" ((long)srcStride)\
2315
        : "memory"\
2316
    );\
2317
    \
2318
    temp_ptr= temp;\
2319
    count=2;\
2320
    \
2321
/*FIXME reorder for speed */\
2322
    asm volatile(\
2323
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2324
        "1:                             \n\t"\
2325
        "movq (%0), %%mm0               \n\t"\
2326
        "movq 8(%0), %%mm1              \n\t"\
2327
        "movq 16(%0), %%mm2             \n\t"\
2328
        "movq 24(%0), %%mm3             \n\t"\
2329
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2330
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2331
        "add %4, %1                     \n\t"\
2332
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2333
        \
2334
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2335
        "add %4, %1                     \n\t"\
2336
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2337
        \
2338
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2339
        "add %4, %1                     \n\t"\
2340
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2341
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2342
                \
2343
        "add $72, %0                    \n\t"\
2344
        "add %6, %1                     \n\t"\
2345
        "decl %2                        \n\t"\
2346
        " jnz 1b                        \n\t"\
2347
         \
2348
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2349
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2350
        : "memory"\
2351
   );\
2352
}\
2353
\
2354
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2355
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
2356
}\
2357
\
2358
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2359
    uint64_t temp[8];\
2360
    uint8_t * const half= (uint8_t*)temp;\
2361
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2362
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2363
}\
2364
\
2365
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2366
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2367
}\
2368
\
2369
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2370
    uint64_t temp[8];\
2371
    uint8_t * const half= (uint8_t*)temp;\
2372
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2373
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2374
}\
2375
\
2376
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2377
    uint64_t temp[8];\
2378
    uint8_t * const half= (uint8_t*)temp;\
2379
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2380
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2381
}\
2382
\
2383
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2384
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2385
}\
2386
\
2387
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2388
    uint64_t temp[8];\
2389
    uint8_t * const half= (uint8_t*)temp;\
2390
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2391
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2392
}\
2393
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2394
    uint64_t half[8 + 9];\
2395
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2396
    uint8_t * const halfHV= ((uint8_t*)half);\
2397
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2398
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2399
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2400
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2401
}\
2402
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2403
    uint64_t half[8 + 9];\
2404
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2405
    uint8_t * const halfHV= ((uint8_t*)half);\
2406
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2407
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2408
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2409
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2410
}\
2411
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2412
    uint64_t half[8 + 9];\
2413
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2414
    uint8_t * const halfHV= ((uint8_t*)half);\
2415
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2416
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2417
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2418
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2419
}\
2420
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2421
    uint64_t half[8 + 9];\
2422
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2423
    uint8_t * const halfHV= ((uint8_t*)half);\
2424
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2425
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2426
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2427
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2428
}\
2429
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2430
    uint64_t half[8 + 9];\
2431
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2432
    uint8_t * const halfHV= ((uint8_t*)half);\
2433
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2434
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2435
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2436
}\
2437
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2438
    uint64_t half[8 + 9];\
2439
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2440
    uint8_t * const halfHV= ((uint8_t*)half);\
2441
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2442
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2443
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2444
}\
2445
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2446
    uint64_t half[8 + 9];\
2447
    uint8_t * const halfH= ((uint8_t*)half);\
2448
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2449
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2450
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2451
}\
2452
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2453
    uint64_t half[8 + 9];\
2454
    uint8_t * const halfH= ((uint8_t*)half);\
2455
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2456
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2457
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2458
}\
2459
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2460
    uint64_t half[9];\
2461
    uint8_t * const halfH= ((uint8_t*)half);\
2462
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2463
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2464
}\
2465
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2466
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
2467
}\
2468
\
2469
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2470
    uint64_t temp[32];\
2471
    uint8_t * const half= (uint8_t*)temp;\
2472
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2473
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2474
}\
2475
\
2476
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2477
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2478
}\
2479
\
2480
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2481
    uint64_t temp[32];\
2482
    uint8_t * const half= (uint8_t*)temp;\
2483
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2484
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2485
}\
2486
\
2487
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2488
    uint64_t temp[32];\
2489
    uint8_t * const half= (uint8_t*)temp;\
2490
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2491
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2492
}\
2493
\
2494
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2495
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2496
}\
2497
\
2498
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2499
    uint64_t temp[32];\
2500
    uint8_t * const half= (uint8_t*)temp;\
2501
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2502
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2503
}\
2504
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2505
    uint64_t half[16*2 + 17*2];\
2506
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2507
    uint8_t * const halfHV= ((uint8_t*)half);\
2508
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2509
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2510
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2511
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2512
}\
2513
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2514
    uint64_t half[16*2 + 17*2];\
2515
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2516
    uint8_t * const halfHV= ((uint8_t*)half);\
2517
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2518
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2519
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2520
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2521
}\
2522
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2523
    uint64_t half[16*2 + 17*2];\
2524
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2525
    uint8_t * const halfHV= ((uint8_t*)half);\
2526
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2527
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2528
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2529
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2530
}\
2531
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2532
    uint64_t half[16*2 + 17*2];\
2533
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2534
    uint8_t * const halfHV= ((uint8_t*)half);\
2535
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2536
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2537
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2538
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2539
}\
2540
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2541
    uint64_t half[16*2 + 17*2];\
2542
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2543
    uint8_t * const halfHV= ((uint8_t*)half);\
2544
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2545
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2546
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2547
}\
2548
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2549
    uint64_t half[16*2 + 17*2];\
2550
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2551
    uint8_t * const halfHV= ((uint8_t*)half);\
2552
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2553
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2554
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2555
}\
2556
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2557
    uint64_t half[17*2];\
2558
    uint8_t * const halfH= ((uint8_t*)half);\
2559
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2560
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2561
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2562
}\
2563
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2564
    uint64_t half[17*2];\
2565
    uint8_t * const halfH= ((uint8_t*)half);\
2566
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2567
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2568
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2569
}\
2570
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2571
    uint64_t half[17*2];\
2572
    uint8_t * const halfH= ((uint8_t*)half);\
2573
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2574
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2575
}
2576

    
2577
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
2578
#define AVG_3DNOW_OP(a,b,temp, size) \
2579
"mov" #size " " #b ", " #temp "   \n\t"\
2580
"pavgusb " #temp ", " #a "        \n\t"\
2581
"mov" #size " " #a ", " #b "      \n\t"
2582
#define AVG_MMX2_OP(a,b,temp, size) \
2583
"mov" #size " " #b ", " #temp "   \n\t"\
2584
"pavgb " #temp ", " #a "          \n\t"\
2585
"mov" #size " " #a ", " #b "      \n\t"
2586

    
2587
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
2588
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
2589
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2590
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
2591
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
2592
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
2593
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
2594
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
2595
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2596

    
2597
/***********************************/
2598
/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
2599

    
2600
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2601
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2602
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2603
}
2604
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2605
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2606
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2607
}
2608

    
2609
#define QPEL_2TAP(OPNAME, SIZE, MMX)\
2610
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2611
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2612
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2613
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2614
                          OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2615
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2616
                          OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2617
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2618
                          OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2619
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2620
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2621
}\
2622
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2623
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2624
}\
2625
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
2626
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
2627
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0)\
2628
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
2629
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
2630
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
2631
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
2632
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
2633

    
2634
QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_,  8, mmx2)
QPEL_2TAP(avg_,  8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)


#if 0
static void just_return() { return; }
#endif

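/* Global motion compensation for the common case: the fullpel offset is the
 * same for the whole 8-pixel-wide block and the motion uses at most 16 bits
 * of subpel precision; everything else falls back to ff_gmc_c(). Per pixel
 * the MMX loop evaluates, roughly (s = 1<<shift, dx/dy = subpel fractions):
 *
 *   dst = ( src[0,0]*(s-dx)*(s-dy) + src[1,0]*dx*(s-dy)
 *         + src[0,1]*(s-dx)*dy     + src[1,1]*dx*dy     + r ) >> (2*shift)
 *
 * i.e. plain bilinear interpolation with rounding constant r, four pixels at
 * a time in 16-bit arithmetic. */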
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w = 8;
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    if( // non-constant fullpel offset (3% of blocks)
        ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
         (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
    {
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }

    asm volatile(
        "movd         %0, %%mm6 \n\t"
        "pxor      %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1<<shift)
    );

    for(x=0; x<w; x+=4){
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            asm volatile(
                "movq   %0,  %%mm4 \n\t"
                "movq   %1,  %%mm5 \n\t"
                "paddw  %2,  %%mm4 \n\t"
                "paddw  %3,  %%mm5 \n\t"
                "movq   %%mm4, %0  \n\t"
                "movq   %%mm5, %1  \n\t"
                "psrlw  $12, %%mm4 \n\t"
                "psrlw  $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            asm volatile(
                "movq   %%mm6, %%mm2 \n\t"
                "movq   %%mm6, %%mm1 \n\t"
                "psubw  %%mm4, %%mm2 \n\t"
                "psubw  %%mm5, %%mm1 \n\t"
                "movq   %%mm2, %%mm0 \n\t"
                "movq   %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)

                "movd   %4,    %%mm5 \n\t"
                "movd   %3,    %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy

                "movd   %2,    %%mm5 \n\t"
                "movd   %1,    %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw  %5,    %%mm1 \n\t"
                "paddw  %%mm3, %%mm2 \n\t"
                "paddw  %%mm1, %%mm0 \n\t"
                "paddw  %%mm2, %%mm0 \n\t"

                "psrlw    %6,    %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd     %%mm0, %0    \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4-h*stride;
    }
}

#ifdef CONFIG_ENCODERS

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
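
/* PHADDD: horizontal add of the two 32-bit halves of an MMX register; after
 * the movq/psrlq/paddd sequence the low dword of "a" holds the sum of both
 * dwords. The SSSE3 section below swaps in a pshufw+paddd variant instead
 * (noted there as faster than phaddd on Core 2). */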
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1
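
/* Plain MMX has no rounding high multiply, so this PMULHRW approximates
 * pmulhrw with pmulhw (truncating) followed by +1 (the "o" operand, set up
 * via SET_RND/MOVQ_WONE) and an arithmetic shift right by 1; the extra
 * halving is presumably what SCALE_OFFSET 1 compensates for inside
 * dsputil_mmx_qns.h. That header is included three times below with
 * different DEF/SET_RND/SCALE_OFFSET/PMULHRW definitions, generating the
 * _mmx, _3dnow and _ssse3 flavours of the same functions. */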

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3

#endif /* CONFIG_ENCODERS */

#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        asm volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
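
/* Each PREFETCH() expansion is a tiny function that issues one software
 * prefetch per row of the block, h rows spaced stride bytes apart:
 * prefetch_mmx2 uses the SSE prefetcht0 hint (fetch into all cache levels),
 * prefetch_3dnow the 3DNow! prefetch instruction. */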

#include "h264dsp_mmx.c"

/* CAVS specific */
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* FLAC specific */
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);

/* VC1 specific */
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}
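
/* mc00 is the unfiltered fullpel position, so the CAVS and VC1 wrappers above
 * just forward to the generic MMX copy/average routines. */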

/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
#ifdef CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
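
/* All of the *_idct_put/*_idct_add wrappers above follow the same pattern
 * required by DSPContext: run the bare IDCT on the coefficient block, then
 * either store the clamped result (put_pixels_clamped_mmx) or add it to the
 * existing prediction (add_pixels_clamped_mmx). */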
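/* Vorbis inverse channel coupling. Both versions below are branchless SIMD
 * translations of (roughly) the following scalar code, using the sign bit of
 * mag to conditionally negate ang and a compare mask to pick which of the two
 * outputs receives the combined value:
 *
 *   for (i = 0; i < blocksize; i++) {
 *       float m = mag[i], a = ang[i];
 *       if (a > 0) { mag[i] = m;                          ang[i] = (m > 0) ? m - a : m + a; }
 *       else       { mag[i] = (m > 0) ? m + a : m - a;    ang[i] = m; }
 *   }
 */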
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    asm volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        asm volatile(
            "movq    %0,    %%mm0 \n\t"
            "movq    %1,    %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1 \n\t"
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t"
            "pandn   %%mm1, %%mm4 \n\t"
            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    asm volatile("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    asm volatile(
            "movaps  %0,     %%xmm5 \n\t"
        ::"m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        asm volatile(
            "movaps  %0,     %%xmm0 \n\t"
            "movaps  %1,     %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}

static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long i = (len-4)*4;
    asm volatile(
        "1: \n\t"
        "movq    (%1,%0), %%mm0 \n\t"
        "movq   8(%1,%0), %%mm1 \n\t"
        "pfmul   (%2,%0), %%mm0 \n\t"
        "pfmul  8(%2,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub  $16, %0 \n\t"
        "jge 1b \n\t"
        "femms  \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i = (len-8)*4;
    asm volatile(
        "1: \n\t"
        "movaps    (%1,%0), %%xmm0 \n\t"
        "movaps  16(%1,%0), %%xmm1 \n\t"
        "mulps     (%2,%0), %%xmm0 \n\t"
        "mulps   16(%2,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub  $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
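
/* Equivalent to: for (i = 0; i < len; i++) dst[i] *= src[i];
 * The loops above run backwards from the end of the arrays, 4 (3DNow!) or
 * 8 (SSE) floats per iteration, so len is assumed to be a multiple of that,
 * and the movaps accesses require dst/src to be 16-byte aligned. */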

static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;
    asm volatile(
        "1: \n\t"
        "pswapd   8(%1), %%mm0 \n\t"
        "pswapd    (%1), %%mm1 \n\t"
        "pfmul  (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq  %%mm0,  (%2,%0) \n\t"
        "movq  %%mm1, 8(%2,%0) \n\t"
        "add   $16, %1 \n\t"
        "sub   $16, %0 \n\t"
        "jge   1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
    asm volatile("femms");
}
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;
    asm volatile(
        "1: \n\t"
        "movaps        16(%1), %%xmm0 \n\t"
        "movaps          (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps        (%3,%0), %%xmm0 \n\t"
        "mulps      16(%3,%0), %%xmm1 \n\t"
        "movaps     %%xmm0,   (%2,%0) \n\t"
        "movaps     %%xmm1, 16(%2,%0) \n\t"
        "add    $32, %1 \n\t"
        "sub    $32, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
}
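
/* Equivalent to: for (i = 0; i < len; i++) dst[i] = src0[i] * src1[len-1-i];
 * src1 is walked forwards while dst/src0 are walked backwards; pswapd
 * (extended 3DNow!) and shufps $0x1b (SSE) reverse the element order within
 * each register so the two streams line up. */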

static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long i = (len-4)*4;
    if(step == 2 && src3 == 0){
        dst += (len-4)*2;
        asm volatile(
            "1: \n\t"
            "movq   (%2,%0),  %%mm0 \n\t"
            "movq  8(%2,%0),  %%mm1 \n\t"
            "pfmul  (%3,%0),  %%mm0 \n\t"
            "pfmul 8(%3,%0),  %%mm1 \n\t"
            "pfadd  (%4,%0),  %%mm0 \n\t"
            "pfadd 8(%4,%0),  %%mm1 \n\t"
            "movd     %%mm0,   (%1) \n\t"
            "movd     %%mm1, 16(%1) \n\t"
            "psrlq      $32,  %%mm0 \n\t"
            "psrlq      $32,  %%mm1 \n\t"
            "movd     %%mm0,  8(%1) \n\t"
            "movd     %%mm1, 24(%1) \n\t"
            "sub  $32, %1 \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movq  %%mm0,   (%1,%0) \n\t"
            "movq  %%mm1,  8(%1,%0) \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    asm volatile("femms");
}
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    long i = (len-8)*4;
    if(step == 2 && src3 == 0){
        dst += (len-8)*2;
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movss     %%xmm0,   (%1) \n\t"
            "movss     %%xmm1, 32(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 16(%1) \n\t"
            "movss     %%xmm3, 48(%1) \n\t"
            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
            "movss     %%xmm0,  8(%1) \n\t"
            "movss     %%xmm1, 40(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 24(%1) \n\t"
            "movss     %%xmm3, 56(%1) \n\t"
            "sub  $64, %1 \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movaps %%xmm0,   (%1,%0) \n\t"
            "movaps %%xmm1, 16(%1,%0) \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}
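
/* Equivalent to: for (i = 0; i < len; i++) dst[i*step] = src0[i]*src1[i] + src2[i];
 * Only the step==1 and step==2 cases with src3==0 are handled in SIMD (the
 * step==2 path scatters the results to every other float of dst with
 * movd/movss stores); all other cases fall through to
 * ff_vector_fmul_add_add_c(). */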

static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "pf2id       %1, %%mm0 \n\t"
            "pf2id       %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("femms");
}
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "cvtps2pi    %1, %%mm0 \n\t"
            "cvtps2pi    %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("emms");
}
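
/* Float -> int16 conversion, 4 samples per iteration, with saturation via
 * packssdw; len is assumed to be a multiple of 4. cvtps2pi honours the
 * current MXCSR rounding mode (round-to-nearest by default), while pf2id
 * truncates, which is why the 3DNow! version is marked as not bit-exact. */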

extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);

void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    mm_flags = mm_support();

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & FF_MM_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);