
ffmpeg / libavcodec / i386 / dsputil_mmx.c @ 28748a91

/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "dsputil.h"
#include "dsputil_mmx.h"
#include "simple_idct.h"
#include "mpegvideo.h"
#include "x86_cpu.h"
#include "mmx.h"
#include "vp3dsp_mmx.h"
#include "vp3dsp_sse2.h"
#include "h263.h"

//#undef NDEBUG
//#include <assert.h>

extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */

/* pixel operations */
DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_5  ) = 0x0005000500050005ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_32 ) = 0x0020002000200020ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED_16(const uint64_t, ff_pw_128) = 0x0080008000800080ULL;

DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };

#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
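// PAVGB_MMX_NO_RND computes (a & b) + (((a ^ b) & 0xFE) >> 1) per byte, i.e.
// the byte-wise average (a + b) >> 1 rounded down, while PAVGB_MMX computes
// (a | b) - (((a ^ b) & 0xFE) >> 1), i.e. the average (a + b + 1) >> 1 rounded up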
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
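/* expand an 8x8 block of unsigned bytes into 16-bit DCTELEMs; mm7 stays zero
   so punpcklbw/punpckhbw perform the zero extension of each byte */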
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
#endif //CONFIG_ENCODERS
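
/* write an 8x8 block of DCT coefficients back as pixels; packuswb performs the
   required clamping of each 16-bit value to the unsigned 0..255 range */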
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
        __asm __volatile(
                "movq   %3, %%mm0               \n\t"
                "movq   8%3, %%mm1              \n\t"
                "movq   16%3, %%mm2             \n\t"
                "movq   24%3, %%mm3             \n\t"
                "movq   32%3, %%mm4             \n\t"
                "movq   40%3, %%mm5             \n\t"
                "movq   48%3, %%mm6             \n\t"
                "movq   56%3, %%mm7             \n\t"
                "packuswb %%mm1, %%mm0          \n\t"
                "packuswb %%mm3, %%mm2          \n\t"
                "packuswb %%mm5, %%mm4          \n\t"
                "packuswb %%mm7, %%mm6          \n\t"
                "movq   %%mm0, (%0)             \n\t"
                "movq   %%mm2, (%0, %1)         \n\t"
                "movq   %%mm4, (%0, %1, 2)      \n\t"
                "movq   %%mm6, (%0, %2)         \n\t"
                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
                :"memory");
        pix += line_size*4;
        p += 32;

    // if this were an exact copy of the code above, the compiler would
    // generate some very strange code, thus we use "r" here
    __asm __volatile(
            "movq       (%3), %%mm0             \n\t"
            "movq       8(%3), %%mm1            \n\t"
            "movq       16(%3), %%mm2           \n\t"
            "movq       24(%3), %%mm3           \n\t"
            "movq       32(%3), %%mm4           \n\t"
            "movq       40(%3), %%mm5           \n\t"
            "movq       48(%3), %%mm6           \n\t"
            "movq       56(%3), %%mm7           \n\t"
            "packuswb %%mm1, %%mm0              \n\t"
            "packuswb %%mm3, %%mm2              \n\t"
            "packuswb %%mm5, %%mm4              \n\t"
            "packuswb %%mm7, %%mm6              \n\t"
            "movq       %%mm0, (%0)             \n\t"
            "movq       %%mm2, (%0, %1)         \n\t"
            "movq       %%mm4, (%0, %1, 2)      \n\t"
            "movq       %%mm6, (%0, %2)         \n\t"
            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
            :"memory");
}

static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}

void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
                "movq   (%2), %%mm0     \n\t"
                "movq   8(%2), %%mm1    \n\t"
                "movq   16(%2), %%mm2   \n\t"
                "movq   24(%2), %%mm3   \n\t"
                "movq   %0, %%mm4       \n\t"
                "movq   %1, %%mm6       \n\t"
                "movq   %%mm4, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm4, %%mm0    \n\t"
                "paddsw %%mm5, %%mm1    \n\t"
                "movq   %%mm6, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm6, %%mm2    \n\t"
                "paddsw %%mm5, %%mm3    \n\t"
                "packuswb %%mm1, %%mm0  \n\t"
                "packuswb %%mm3, %%mm2  \n\t"
                "movq   %%mm0, %0       \n\t"
                "movq   %%mm2, %1       \n\t"
                :"+m"(*pix), "+m"(*(pix+line_size))
                :"r"(p)
                :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "mov $-128*6, %%"REG_a"         \n\t"
                "1:                             \n\t"
                "movq %%mm7, (%0, %%"REG_a")    \n\t"
                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
                "add $32, %%"REG_a"             \n\t"
                " js 1b                         \n\t"
                : : "r" (((uint8_t *)blocks)+128*6)
                : "%"REG_a
        );
}

#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((long)line_size)
        );

        return sum;
}
#endif //CONFIG_ENCODERS

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
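
/* H263_LOOP_FILTER is the deblocking kernel shared by the vertical and
   horizontal H.263 loop filters below: %0..%3 hold the four 8-pixel lines
   around the block edge, %4 is 2*strength and %5 is ff_pb_FC, which clears
   the two low bits so that psrlw $2 behaves like a per-byte shift */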
#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7              \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "psubw %%mm2, %%mm0             \n\t"\
        "psubw %%mm3, %%mm1             \n\t"\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "punpcklbw %%mm7, %%mm4         \n\t"\
        "punpckhbw %%mm7, %%mm5         \n\t"\
        "psubw %%mm2, %%mm4             \n\t"\
        "psubw %%mm3, %%mm5             \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4             \n\t"\
        "paddw %%mm1, %%mm5             \n\t"\
        "pxor %%mm6, %%mm6              \n\t"\
        "pcmpgtw %%mm4, %%mm6           \n\t"\
        "pcmpgtw %%mm5, %%mm7           \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "pxor %%mm7, %%mm5              \n\t"\
        "psubw %%mm6, %%mm4             \n\t"\
        "psubw %%mm7, %%mm5             \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm4          \n\t"\
        "packsswb %%mm7, %%mm6          \n\t"\
        "pxor %%mm7, %%mm7              \n\t"\
        "movd %4, %%mm2                 \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "psubusb %%mm4, %%mm2           \n\t"\
        "movq %%mm2, %%mm3              \n\t"\
        "psubusb %%mm4, %%mm3           \n\t"\
        "psubb %%mm3, %%mm2             \n\t"\
        "movq %1, %%mm3                 \n\t"\
        "movq %2, %%mm4                 \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm3           \n\t"\
        "psubusb %%mm2, %%mm4           \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm2           \n\t"\
        "packsswb %%mm1, %%mm0          \n\t"\
        "pcmpgtb %%mm0, %%mm7           \n\t"\
        "pxor %%mm7, %%mm0              \n\t"\
        "psubb %%mm7, %%mm0             \n\t"\
        "movq %%mm0, %%mm1              \n\t"\
        "psubusb %%mm2, %%mm0           \n\t"\
        "psubb %%mm0, %%mm1             \n\t"\
        "pand %5, %%mm1                 \n\t"\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1              \n\t"\
        "psubb %%mm7, %%mm1             \n\t"\
        "movq %0, %%mm5                 \n\t"\
        "movq %3, %%mm6                 \n\t"\
        "psubb %%mm1, %%mm5             \n\t"\
        "paddb %%mm1, %%mm6             \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    uint64_t temp[4] __attribute__ ((aligned(8)));
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1              \n\t"
        "movq %%mm4, %%mm0              \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "movq %%mm1, %%mm6              \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd %%mm5, (%0)               \n\t"
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd %%mm5, (%0,%2)            \n\t"
        "movd %%mm3, (%0,%2,2)          \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd %%mm3, (%0,%3)            \n\t"
        "movd %%mm1, (%1)               \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%1,%2)            \n\t"
        "movd %%mm6, (%1,%2,2)          \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd %%mm6, (%1,%3)            \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride ),
           "r" ((long)(3*stride))
    );
    }
}

#ifdef CONFIG_ENCODERS
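/* sum of squares over a 16x16 block: pmaddwd squares pairs of 16-bit pixels and
   adds them into 32-bit lanes, which are reduced to a single sum at the end */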
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((long)line_size));
    return tmp;
}

    
948
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
949
    int tmp;
950
  asm volatile (
951
      "movl %3,%%ecx\n"
952
      "pxor %%mm7,%%mm7\n"
953
      "pxor %%mm6,%%mm6\n"
954

    
955
      "movq (%0),%%mm0\n"
956
      "movq %%mm0, %%mm1\n"
957
      "psllq $8, %%mm0\n"
958
      "psrlq $8, %%mm1\n"
959
      "psrlq $8, %%mm0\n"
960
      "movq %%mm0, %%mm2\n"
961
      "movq %%mm1, %%mm3\n"
962
      "punpcklbw %%mm7,%%mm0\n"
963
      "punpcklbw %%mm7,%%mm1\n"
964
      "punpckhbw %%mm7,%%mm2\n"
965
      "punpckhbw %%mm7,%%mm3\n"
966
      "psubw %%mm1, %%mm0\n"
967
      "psubw %%mm3, %%mm2\n"
968

    
969
      "add %2,%0\n"
970

    
971
      "movq (%0),%%mm4\n"
972
      "movq %%mm4, %%mm1\n"
973
      "psllq $8, %%mm4\n"
974
      "psrlq $8, %%mm1\n"
975
      "psrlq $8, %%mm4\n"
976
      "movq %%mm4, %%mm5\n"
977
      "movq %%mm1, %%mm3\n"
978
      "punpcklbw %%mm7,%%mm4\n"
979
      "punpcklbw %%mm7,%%mm1\n"
980
      "punpckhbw %%mm7,%%mm5\n"
981
      "punpckhbw %%mm7,%%mm3\n"
982
      "psubw %%mm1, %%mm4\n"
983
      "psubw %%mm3, %%mm5\n"
984
      "psubw %%mm4, %%mm0\n"
985
      "psubw %%mm5, %%mm2\n"
986
      "pxor %%mm3, %%mm3\n"
987
      "pxor %%mm1, %%mm1\n"
988
      "pcmpgtw %%mm0, %%mm3\n\t"
989
      "pcmpgtw %%mm2, %%mm1\n\t"
990
      "pxor %%mm3, %%mm0\n"
991
      "pxor %%mm1, %%mm2\n"
992
      "psubw %%mm3, %%mm0\n"
993
      "psubw %%mm1, %%mm2\n"
994
      "paddw %%mm0, %%mm2\n"
995
      "paddw %%mm2, %%mm6\n"
996

    
997
      "add %2,%0\n"
998
      "1:\n"
999

    
1000
      "movq (%0),%%mm0\n"
1001
      "movq %%mm0, %%mm1\n"
1002
      "psllq $8, %%mm0\n"
1003
      "psrlq $8, %%mm1\n"
1004
      "psrlq $8, %%mm0\n"
1005
      "movq %%mm0, %%mm2\n"
1006
      "movq %%mm1, %%mm3\n"
1007
      "punpcklbw %%mm7,%%mm0\n"
1008
      "punpcklbw %%mm7,%%mm1\n"
1009
      "punpckhbw %%mm7,%%mm2\n"
1010
      "punpckhbw %%mm7,%%mm3\n"
1011
      "psubw %%mm1, %%mm0\n"
1012
      "psubw %%mm3, %%mm2\n"
1013
      "psubw %%mm0, %%mm4\n"
1014
      "psubw %%mm2, %%mm5\n"
1015
      "pxor %%mm3, %%mm3\n"
1016
      "pxor %%mm1, %%mm1\n"
1017
      "pcmpgtw %%mm4, %%mm3\n\t"
1018
      "pcmpgtw %%mm5, %%mm1\n\t"
1019
      "pxor %%mm3, %%mm4\n"
1020
      "pxor %%mm1, %%mm5\n"
1021
      "psubw %%mm3, %%mm4\n"
1022
      "psubw %%mm1, %%mm5\n"
1023
      "paddw %%mm4, %%mm5\n"
1024
      "paddw %%mm5, %%mm6\n"
1025

    
1026
      "add %2,%0\n"
1027

    
1028
      "movq (%0),%%mm4\n"
1029
      "movq %%mm4, %%mm1\n"
1030
      "psllq $8, %%mm4\n"
1031
      "psrlq $8, %%mm1\n"
1032
      "psrlq $8, %%mm4\n"
1033
      "movq %%mm4, %%mm5\n"
1034
      "movq %%mm1, %%mm3\n"
1035
      "punpcklbw %%mm7,%%mm4\n"
1036
      "punpcklbw %%mm7,%%mm1\n"
1037
      "punpckhbw %%mm7,%%mm5\n"
1038
      "punpckhbw %%mm7,%%mm3\n"
1039
      "psubw %%mm1, %%mm4\n"
1040
      "psubw %%mm3, %%mm5\n"
1041
      "psubw %%mm4, %%mm0\n"
1042
      "psubw %%mm5, %%mm2\n"
1043
      "pxor %%mm3, %%mm3\n"
1044
      "pxor %%mm1, %%mm1\n"
1045
      "pcmpgtw %%mm0, %%mm3\n\t"
1046
      "pcmpgtw %%mm2, %%mm1\n\t"
1047
      "pxor %%mm3, %%mm0\n"
1048
      "pxor %%mm1, %%mm2\n"
1049
      "psubw %%mm3, %%mm0\n"
1050
      "psubw %%mm1, %%mm2\n"
1051
      "paddw %%mm0, %%mm2\n"
1052
      "paddw %%mm2, %%mm6\n"
1053

    
1054
      "add %2,%0\n"
1055
      "subl $2, %%ecx\n"
1056
      " jnz 1b\n"
1057

    
1058
      "movq %%mm6, %%mm0\n"
1059
      "punpcklwd %%mm7,%%mm0\n"
1060
      "punpckhwd %%mm7,%%mm6\n"
1061
      "paddd %%mm0, %%mm6\n"
1062

    
1063
      "movq %%mm6,%%mm0\n"
1064
      "psrlq $32, %%mm6\n"
1065
      "paddd %%mm6,%%mm0\n"
1066
      "movd %%mm0,%1\n"
1067
      : "+r" (pix1), "=r"(tmp)
1068
      : "r" ((long)line_size) , "g" (h-2)
1069
      : "%ecx");
1070
      return tmp;
1071
}
1072

    
1073
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1074
    int tmp;
1075
    uint8_t * pix= pix1;
1076
  asm volatile (
1077
      "movl %3,%%ecx\n"
1078
      "pxor %%mm7,%%mm7\n"
1079
      "pxor %%mm6,%%mm6\n"
1080

    
1081
      "movq (%0),%%mm0\n"
1082
      "movq 1(%0),%%mm1\n"
1083
      "movq %%mm0, %%mm2\n"
1084
      "movq %%mm1, %%mm3\n"
1085
      "punpcklbw %%mm7,%%mm0\n"
1086
      "punpcklbw %%mm7,%%mm1\n"
1087
      "punpckhbw %%mm7,%%mm2\n"
1088
      "punpckhbw %%mm7,%%mm3\n"
1089
      "psubw %%mm1, %%mm0\n"
1090
      "psubw %%mm3, %%mm2\n"
1091

    
1092
      "add %2,%0\n"
1093

    
1094
      "movq (%0),%%mm4\n"
1095
      "movq 1(%0),%%mm1\n"
1096
      "movq %%mm4, %%mm5\n"
1097
      "movq %%mm1, %%mm3\n"
1098
      "punpcklbw %%mm7,%%mm4\n"
1099
      "punpcklbw %%mm7,%%mm1\n"
1100
      "punpckhbw %%mm7,%%mm5\n"
1101
      "punpckhbw %%mm7,%%mm3\n"
1102
      "psubw %%mm1, %%mm4\n"
1103
      "psubw %%mm3, %%mm5\n"
1104
      "psubw %%mm4, %%mm0\n"
1105
      "psubw %%mm5, %%mm2\n"
1106
      "pxor %%mm3, %%mm3\n"
1107
      "pxor %%mm1, %%mm1\n"
1108
      "pcmpgtw %%mm0, %%mm3\n\t"
1109
      "pcmpgtw %%mm2, %%mm1\n\t"
1110
      "pxor %%mm3, %%mm0\n"
1111
      "pxor %%mm1, %%mm2\n"
1112
      "psubw %%mm3, %%mm0\n"
1113
      "psubw %%mm1, %%mm2\n"
1114
      "paddw %%mm0, %%mm2\n"
1115
      "paddw %%mm2, %%mm6\n"
1116

    
1117
      "add %2,%0\n"
1118
      "1:\n"
1119

    
1120
      "movq (%0),%%mm0\n"
1121
      "movq 1(%0),%%mm1\n"
1122
      "movq %%mm0, %%mm2\n"
1123
      "movq %%mm1, %%mm3\n"
1124
      "punpcklbw %%mm7,%%mm0\n"
1125
      "punpcklbw %%mm7,%%mm1\n"
1126
      "punpckhbw %%mm7,%%mm2\n"
1127
      "punpckhbw %%mm7,%%mm3\n"
1128
      "psubw %%mm1, %%mm0\n"
1129
      "psubw %%mm3, %%mm2\n"
1130
      "psubw %%mm0, %%mm4\n"
1131
      "psubw %%mm2, %%mm5\n"
1132
      "pxor %%mm3, %%mm3\n"
1133
      "pxor %%mm1, %%mm1\n"
1134
      "pcmpgtw %%mm4, %%mm3\n\t"
1135
      "pcmpgtw %%mm5, %%mm1\n\t"
1136
      "pxor %%mm3, %%mm4\n"
1137
      "pxor %%mm1, %%mm5\n"
1138
      "psubw %%mm3, %%mm4\n"
1139
      "psubw %%mm1, %%mm5\n"
1140
      "paddw %%mm4, %%mm5\n"
1141
      "paddw %%mm5, %%mm6\n"
1142

    
1143
      "add %2,%0\n"
1144

    
1145
      "movq (%0),%%mm4\n"
1146
      "movq 1(%0),%%mm1\n"
1147
      "movq %%mm4, %%mm5\n"
1148
      "movq %%mm1, %%mm3\n"
1149
      "punpcklbw %%mm7,%%mm4\n"
1150
      "punpcklbw %%mm7,%%mm1\n"
1151
      "punpckhbw %%mm7,%%mm5\n"
1152
      "punpckhbw %%mm7,%%mm3\n"
1153
      "psubw %%mm1, %%mm4\n"
1154
      "psubw %%mm3, %%mm5\n"
1155
      "psubw %%mm4, %%mm0\n"
1156
      "psubw %%mm5, %%mm2\n"
1157
      "pxor %%mm3, %%mm3\n"
1158
      "pxor %%mm1, %%mm1\n"
1159
      "pcmpgtw %%mm0, %%mm3\n\t"
1160
      "pcmpgtw %%mm2, %%mm1\n\t"
1161
      "pxor %%mm3, %%mm0\n"
1162
      "pxor %%mm1, %%mm2\n"
1163
      "psubw %%mm3, %%mm0\n"
1164
      "psubw %%mm1, %%mm2\n"
1165
      "paddw %%mm0, %%mm2\n"
1166
      "paddw %%mm2, %%mm6\n"
1167

    
1168
      "add %2,%0\n"
1169
      "subl $2, %%ecx\n"
1170
      " jnz 1b\n"
1171

    
1172
      "movq %%mm6, %%mm0\n"
1173
      "punpcklwd %%mm7,%%mm0\n"
1174
      "punpckhwd %%mm7,%%mm6\n"
1175
      "paddd %%mm0, %%mm6\n"
1176

    
1177
      "movq %%mm6,%%mm0\n"
1178
      "psrlq $32, %%mm6\n"
1179
      "paddd %%mm6,%%mm0\n"
1180
      "movd %%mm0,%1\n"
1181
      : "+r" (pix1), "=r"(tmp)
1182
      : "r" ((long)line_size) , "g" (h-2)
1183
      : "%ecx");
1184
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
1185
}
1186

    
1187
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1188
    MpegEncContext *c = p;
1189
    int score1, score2;
1190

    
1191
    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1192
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1193
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1194

    
1195
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1196
    else  return score1 + FFABS(score2)*8;
1197
}
1198

    
1199
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1200
    MpegEncContext *c = p;
1201
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1202
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1203

    
1204
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1205
    else  return score1 + FFABS(score2)*8;
1206
}
1207

    
1208
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1209
    int tmp;
1210

    
1211
    assert( (((int)pix) & 7) == 0);
1212
    assert((line_size &7) ==0);
1213

    
1214
#define SUM(in0, in1, out0, out1) \
1215
      "movq (%0), %%mm2\n"\
1216
      "movq 8(%0), %%mm3\n"\
1217
      "add %2,%0\n"\
1218
      "movq %%mm2, " #out0 "\n"\
1219
      "movq %%mm3, " #out1 "\n"\
1220
      "psubusb " #in0 ", %%mm2\n"\
1221
      "psubusb " #in1 ", %%mm3\n"\
1222
      "psubusb " #out0 ", " #in0 "\n"\
1223
      "psubusb " #out1 ", " #in1 "\n"\
1224
      "por %%mm2, " #in0 "\n"\
1225
      "por %%mm3, " #in1 "\n"\
1226
      "movq " #in0 ", %%mm2\n"\
1227
      "movq " #in1 ", %%mm3\n"\
1228
      "punpcklbw %%mm7, " #in0 "\n"\
1229
      "punpcklbw %%mm7, " #in1 "\n"\
1230
      "punpckhbw %%mm7, %%mm2\n"\
1231
      "punpckhbw %%mm7, %%mm3\n"\
1232
      "paddw " #in1 ", " #in0 "\n"\
1233
      "paddw %%mm3, %%mm2\n"\
1234
      "paddw %%mm2, " #in0 "\n"\
1235
      "paddw " #in0 ", %%mm6\n"
1236

    
1237

    
1238
  asm volatile (
1239
      "movl %3,%%ecx\n"
1240
      "pxor %%mm6,%%mm6\n"
1241
      "pxor %%mm7,%%mm7\n"
1242
      "movq (%0),%%mm0\n"
1243
      "movq 8(%0),%%mm1\n"
1244
      "add %2,%0\n"
1245
      "subl $2, %%ecx\n"
1246
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1247
      "1:\n"
1248

    
1249
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1250

    
1251
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1252

    
1253
      "subl $2, %%ecx\n"
1254
      "jnz 1b\n"
1255

    
1256
      "movq %%mm6,%%mm0\n"
1257
      "psrlq $32, %%mm6\n"
1258
      "paddw %%mm6,%%mm0\n"
1259
      "movq %%mm0,%%mm6\n"
1260
      "psrlq $16, %%mm0\n"
1261
      "paddw %%mm6,%%mm0\n"
1262
      "movd %%mm0,%1\n"
1263
      : "+r" (pix), "=r"(tmp)
1264
      : "r" ((long)line_size) , "m" (h)
1265
      : "%ecx");
1266
    return tmp & 0xFFFF;
1267
}
1268
#undef SUM
1269

    
1270
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1271
    int tmp;
1272

    
1273
    assert( (((int)pix) & 7) == 0);
1274
    assert((line_size &7) ==0);
1275

    
1276
#define SUM(in0, in1, out0, out1) \
1277
      "movq (%0), " #out0 "\n"\
1278
      "movq 8(%0), " #out1 "\n"\
1279
      "add %2,%0\n"\
1280
      "psadbw " #out0 ", " #in0 "\n"\
1281
      "psadbw " #out1 ", " #in1 "\n"\
1282
      "paddw " #in1 ", " #in0 "\n"\
1283
      "paddw " #in0 ", %%mm6\n"
1284

    
1285
  asm volatile (
1286
      "movl %3,%%ecx\n"
1287
      "pxor %%mm6,%%mm6\n"
1288
      "pxor %%mm7,%%mm7\n"
1289
      "movq (%0),%%mm0\n"
1290
      "movq 8(%0),%%mm1\n"
1291
      "add %2,%0\n"
1292
      "subl $2, %%ecx\n"
1293
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1294
      "1:\n"
1295

    
1296
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1297

    
1298
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1299

    
1300
      "subl $2, %%ecx\n"
1301
      "jnz 1b\n"
1302

    
1303
      "movd %%mm6,%1\n"
1304
      : "+r" (pix), "=r"(tmp)
1305
      : "r" ((long)line_size) , "m" (h)
1306
      : "%ecx");
1307
    return tmp;
1308
}
1309
#undef SUM
1310

    
1311
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1312
    int tmp;
1313

    
1314
    assert( (((int)pix1) & 7) == 0);
1315
    assert( (((int)pix2) & 7) == 0);
1316
    assert((line_size &7) ==0);
1317

    
1318
#define SUM(in0, in1, out0, out1) \
1319
      "movq (%0),%%mm2\n"\
1320
      "movq (%1)," #out0 "\n"\
1321
      "movq 8(%0),%%mm3\n"\
1322
      "movq 8(%1)," #out1 "\n"\
1323
      "add %3,%0\n"\
1324
      "add %3,%1\n"\
1325
      "psubb " #out0 ", %%mm2\n"\
1326
      "psubb " #out1 ", %%mm3\n"\
1327
      "pxor %%mm7, %%mm2\n"\
1328
      "pxor %%mm7, %%mm3\n"\
1329
      "movq %%mm2, " #out0 "\n"\
1330
      "movq %%mm3, " #out1 "\n"\
1331
      "psubusb " #in0 ", %%mm2\n"\
1332
      "psubusb " #in1 ", %%mm3\n"\
1333
      "psubusb " #out0 ", " #in0 "\n"\
1334
      "psubusb " #out1 ", " #in1 "\n"\
1335
      "por %%mm2, " #in0 "\n"\
1336
      "por %%mm3, " #in1 "\n"\
1337
      "movq " #in0 ", %%mm2\n"\
1338
      "movq " #in1 ", %%mm3\n"\
1339
      "punpcklbw %%mm7, " #in0 "\n"\
1340
      "punpcklbw %%mm7, " #in1 "\n"\
1341
      "punpckhbw %%mm7, %%mm2\n"\
1342
      "punpckhbw %%mm7, %%mm3\n"\
1343
      "paddw " #in1 ", " #in0 "\n"\
1344
      "paddw %%mm3, %%mm2\n"\
1345
      "paddw %%mm2, " #in0 "\n"\
1346
      "paddw " #in0 ", %%mm6\n"
1347

    
1348

    
1349
  asm volatile (
1350
      "movl %4,%%ecx\n"
1351
      "pxor %%mm6,%%mm6\n"
1352
      "pcmpeqw %%mm7,%%mm7\n"
1353
      "psllw $15, %%mm7\n"
1354
      "packsswb %%mm7, %%mm7\n"
1355
      "movq (%0),%%mm0\n"
1356
      "movq (%1),%%mm2\n"
1357
      "movq 8(%0),%%mm1\n"
1358
      "movq 8(%1),%%mm3\n"
1359
      "add %3,%0\n"
1360
      "add %3,%1\n"
1361
      "subl $2, %%ecx\n"
1362
      "psubb %%mm2, %%mm0\n"
1363
      "psubb %%mm3, %%mm1\n"
1364
      "pxor %%mm7, %%mm0\n"
1365
      "pxor %%mm7, %%mm1\n"
1366
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1367
      "1:\n"
1368

    
1369
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1370

    
1371
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1372

    
1373
      "subl $2, %%ecx\n"
1374
      "jnz 1b\n"
1375

    
1376
      "movq %%mm6,%%mm0\n"
1377
      "psrlq $32, %%mm6\n"
1378
      "paddw %%mm6,%%mm0\n"
1379
      "movq %%mm0,%%mm6\n"
1380
      "psrlq $16, %%mm0\n"
1381
      "paddw %%mm6,%%mm0\n"
1382
      "movd %%mm0,%2\n"
1383
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1384
      : "r" ((long)line_size) , "m" (h)
1385
      : "%ecx");
1386
    return tmp & 0x7FFF;
1387
}
1388
#undef SUM
1389

    
1390
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1391
    int tmp;
1392

    
1393
    assert( (((int)pix1) & 7) == 0);
1394
    assert( (((int)pix2) & 7) == 0);
1395
    assert((line_size &7) ==0);
1396

    
1397
#define SUM(in0, in1, out0, out1) \
1398
      "movq (%0)," #out0 "\n"\
1399
      "movq (%1),%%mm2\n"\
1400
      "movq 8(%0)," #out1 "\n"\
1401
      "movq 8(%1),%%mm3\n"\
1402
      "add %3,%0\n"\
1403
      "add %3,%1\n"\
1404
      "psubb %%mm2, " #out0 "\n"\
1405
      "psubb %%mm3, " #out1 "\n"\
1406
      "pxor %%mm7, " #out0 "\n"\
1407
      "pxor %%mm7, " #out1 "\n"\
1408
      "psadbw " #out0 ", " #in0 "\n"\
1409
      "psadbw " #out1 ", " #in1 "\n"\
1410
      "paddw " #in1 ", " #in0 "\n"\
1411
      "paddw " #in0 ", %%mm6\n"
1412

    
1413
  asm volatile (
1414
      "movl %4,%%ecx\n"
1415
      "pxor %%mm6,%%mm6\n"
1416
      "pcmpeqw %%mm7,%%mm7\n"
1417
      "psllw $15, %%mm7\n"
1418
      "packsswb %%mm7, %%mm7\n"
1419
      "movq (%0),%%mm0\n"
1420
      "movq (%1),%%mm2\n"
1421
      "movq 8(%0),%%mm1\n"
1422
      "movq 8(%1),%%mm3\n"
1423
      "add %3,%0\n"
1424
      "add %3,%1\n"
1425
      "subl $2, %%ecx\n"
1426
      "psubb %%mm2, %%mm0\n"
1427
      "psubb %%mm3, %%mm1\n"
1428
      "pxor %%mm7, %%mm0\n"
1429
      "pxor %%mm7, %%mm1\n"
1430
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1431
      "1:\n"
1432

    
1433
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1434

    
1435
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1436

    
1437
      "subl $2, %%ecx\n"
1438
      "jnz 1b\n"
1439

    
1440
      "movd %%mm6,%2\n"
1441
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1442
      : "r" ((long)line_size) , "m" (h)
1443
      : "%ecx");
1444
    return tmp;
1445
}
1446
#undef SUM
1447

    
1448
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1449
    long i=0;
1450
    asm volatile(
1451
        "1:                             \n\t"
1452
        "movq  (%2, %0), %%mm0          \n\t"
1453
        "movq  (%1, %0), %%mm1          \n\t"
1454
        "psubb %%mm0, %%mm1             \n\t"
1455
        "movq %%mm1, (%3, %0)           \n\t"
1456
        "movq 8(%2, %0), %%mm0          \n\t"
1457
        "movq 8(%1, %0), %%mm1          \n\t"
1458
        "psubb %%mm0, %%mm1             \n\t"
1459
        "movq %%mm1, 8(%3, %0)          \n\t"
1460
        "add $16, %0                    \n\t"
1461
        "cmp %4, %0                     \n\t"
1462
        " jb 1b                         \n\t"
1463
        : "+r" (i)
1464
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1465
    );
1466
    for(; i<w; i++)
1467
        dst[i+0] = src1[i+0]-src2[i+0];
1468
}
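
/* HuffYUV median prediction: pred = median(L, T, L + T - LT), obtained here by
   clamping L + T - LT between min(L, T) and max(L, T) with pminub/pmaxub */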
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
1536

    
1537
#define LBUTTERFLY2(a1,b1,a2,b2)\
1538
    "paddw " #b1 ", " #a1 "           \n\t"\
1539
    "paddw " #b2 ", " #a2 "           \n\t"\
1540
    "paddw " #b1 ", " #b1 "           \n\t"\
1541
    "paddw " #b2 ", " #b2 "           \n\t"\
1542
    "psubw " #a1 ", " #b1 "           \n\t"\
1543
    "psubw " #a2 ", " #b2 "           \n\t"
1544

    
1545
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1546
        LBUTTERFLY2(m0, m1, m2, m3)\
1547
        LBUTTERFLY2(m4, m5, m6, m7)\
1548
        LBUTTERFLY2(m0, m2, m1, m3)\
1549
        LBUTTERFLY2(m4, m6, m5, m7)\
1550
        LBUTTERFLY2(m0, m4, m1, m5)\
1551
        LBUTTERFLY2(m2, m6, m3, m7)\
1552

    
1553
#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
1554

    
1555
#define MMABS_MMX(a,z)\
1556
    "pxor " #z ", " #z "              \n\t"\
1557
    "pcmpgtw " #a ", " #z "           \n\t"\
1558
    "pxor " #z ", " #a "              \n\t"\
1559
    "psubw " #z ", " #a "             \n\t"
1560

    
1561
#define MMABS_MMX2(a,z)\
1562
    "pxor " #z ", " #z "              \n\t"\
1563
    "psubw " #a ", " #z "             \n\t"\
1564
    "pmaxsw " #z ", " #a "            \n\t"
1565

    
1566
#define MMABS_SSSE3(a,z)\
1567
    "pabsw " #a ", " #a "             \n\t"
1568

    
1569
#define MMABS_SUM(a,z, sum)\
1570
    MMABS(a,z)\
1571
    "paddusw " #a ", " #sum "         \n\t"
1572

    
1573
#define MMABS_SUM_8x8_NOSPILL\
1574
    MMABS(%%xmm0, %%xmm8)\
1575
    MMABS(%%xmm1, %%xmm9)\
1576
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1577
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1578
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1579
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1580
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1581
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1582
    "paddusw %%xmm1, %%xmm0           \n\t"
1583

    
1584
#ifdef ARCH_X86_64
1585
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1586
#else
1587
#define MMABS_SUM_8x8_SSE2\
1588
    "movdqa %%xmm7, (%1)              \n\t"\
1589
    MMABS(%%xmm0, %%xmm7)\
1590
    MMABS(%%xmm1, %%xmm7)\
1591
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1592
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1593
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1594
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1595
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1596
    "movdqa (%1), %%xmm2              \n\t"\
1597
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1598
    "paddusw %%xmm1, %%xmm0           \n\t"
1599
#endif
1600

    
1601
#define LOAD4(o, a, b, c, d)\
1602
    "movq "#o"(%1),    "#a"           \n\t"\
1603
    "movq "#o"+8(%1),  "#b"           \n\t"\
1604
    "movq "#o"+16(%1), "#c"           \n\t"\
1605
    "movq "#o"+24(%1), "#d"           \n\t"\
1606

    
1607
#define STORE4(o, a, b, c, d)\
1608
    "movq "#a", "#o"(%1)              \n\t"\
1609
    "movq "#b", "#o"+8(%1)            \n\t"\
1610
    "movq "#c", "#o"+16(%1)           \n\t"\
1611
    "movq "#d", "#o"+24(%1)           \n\t"\
1612

    
1613
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1614
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1615
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
1616
#define HSUM_MMX(a, t, dst)\
1617
    "movq "#a", "#t"                  \n\t"\
1618
    "psrlq $32, "#a"                  \n\t"\
1619
    "paddusw "#t", "#a"               \n\t"\
1620
    "movq "#a", "#t"                  \n\t"\
1621
    "psrlq $16, "#a"                  \n\t"\
1622
    "paddusw "#t", "#a"               \n\t"\
1623
    "movd "#a", "#dst"                \n\t"\
1624

    
1625
#define HSUM_MMX2(a, t, dst)\
1626
    "pshufw $0x0E, "#a", "#t"         \n\t"\
1627
    "paddusw "#t", "#a"               \n\t"\
1628
    "pshufw $0x01, "#a", "#t"         \n\t"\
1629
    "paddusw "#t", "#a"               \n\t"\
1630
    "movd "#a", "#dst"                \n\t"\
1631

    
1632
#define HSUM_SSE2(a, t, dst)\
1633
    "movhlps "#a", "#t"               \n\t"\
1634
    "paddusw "#t", "#a"               \n\t"\
1635
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
1636
    "paddusw "#t", "#a"               \n\t"\
1637
    "pshuflw $0x01, "#a", "#t"        \n\t"\
1638
    "paddusw "#t", "#a"               \n\t"\
1639
    "movd "#a", "#dst"                \n\t"\
1640

    
1641
#define HADAMARD8_DIFF_MMX(cpu) \
1642
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1643
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1644
    int sum;\
1645
\
1646
    assert(h==8);\
1647
\
1648
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1649
\
1650
    asm volatile(\
1651
        HADAMARD48\
1652
\
1653
        "movq %%mm7, 96(%1)             \n\t"\
1654
\
1655
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1656
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1657
\
1658
        "movq 96(%1), %%mm7             \n\t"\
1659
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1660
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1661
\
1662
        : "=r" (sum)\
1663
        : "r"(temp)\
1664
    );\
1665
\
1666
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1667
\
1668
    asm volatile(\
1669
        HADAMARD48\
1670
\
1671
        "movq %%mm7, 96(%1)             \n\t"\
1672
\
1673
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1674
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1675
\
1676
        "movq 96(%1), %%mm7             \n\t"\
1677
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1678
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
1679
        "movq %%mm6, %%mm7              \n\t"\
1680
        "movq %%mm0, %%mm6              \n\t"\
1681
\
1682
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1683
\
1684
        HADAMARD48\
1685
        "movq %%mm7, 64(%1)             \n\t"\
1686
        MMABS(%%mm0, %%mm7)\
1687
        MMABS(%%mm1, %%mm7)\
1688
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1689
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1690
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1691
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1692
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1693
        "movq 64(%1), %%mm2             \n\t"\
1694
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1695
        "paddusw %%mm1, %%mm0           \n\t"\
1696
        "movq %%mm0, 64(%1)             \n\t"\
1697
\
1698
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1699
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1700
\
1701
        HADAMARD48\
1702
        "movq %%mm7, (%1)               \n\t"\
1703
        MMABS(%%mm0, %%mm7)\
1704
        MMABS(%%mm1, %%mm7)\
1705
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1706
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1707
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1708
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1709
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1710
        "movq (%1), %%mm2               \n\t"\
1711
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1712
        "paddusw 64(%1), %%mm0          \n\t"\
1713
        "paddusw %%mm1, %%mm0           \n\t"\
1714
\
1715
        HSUM(%%mm0, %%mm1, %0)\
1716
\
1717
        : "=r" (sum)\
1718
        : "r"(temp)\
1719
    );\
1720
    return sum&0xFFFF;\
1721
}\
1722
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1723

    
1724
#define HADAMARD8_DIFF_SSE2(cpu) \
1725
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1726
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1727
    int sum;\
1728
\
1729
    assert(h==8);\
1730
\
1731
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1732
\
1733
    asm volatile(\
1734
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1735
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1736
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1737
        MMABS_SUM_8x8\
1738
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1739
        : "=r" (sum)\
1740
        : "r"(temp)\
1741
    );\
1742
    return sum&0xFFFF;\
1743
}\
1744
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1745

    
1746
#define MMABS(a,z)         MMABS_MMX(a,z)
1747
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
1748
HADAMARD8_DIFF_MMX(mmx)
1749
#undef MMABS
1750
#undef HSUM
1751

    
1752
#define MMABS(a,z)         MMABS_MMX2(a,z)
1753
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
1754
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
1755
HADAMARD8_DIFF_MMX(mmx2)
1756
HADAMARD8_DIFF_SSE2(sse2)
1757
#undef MMABS
1758
#undef MMABS_SUM_8x8
1759
#undef HSUM
1760

    
1761
#ifdef HAVE_SSSE3
1762
#define MMABS(a,z)         MMABS_SSSE3(a,z)
1763
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
1764
HADAMARD8_DIFF_SSE2(ssse3)
1765
#undef MMABS
1766
#undef MMABS_SUM_8x8
1767
#endif
1768

    
1769
#define DCT_SAD4(m,mm,o)\
1770
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
1771
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
1772
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
1773
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
1774
    MMABS_SUM(mm##2, mm##6, mm##0)\
1775
    MMABS_SUM(mm##3, mm##7, mm##1)\
1776
    MMABS_SUM(mm##4, mm##6, mm##0)\
1777
    MMABS_SUM(mm##5, mm##7, mm##1)\
1778

    
1779
#define DCT_SAD_MMX\
1780
    "pxor %%mm0, %%mm0                \n\t"\
1781
    "pxor %%mm1, %%mm1                \n\t"\
1782
    DCT_SAD4(q, %%mm, 0)\
1783
    DCT_SAD4(q, %%mm, 8)\
1784
    DCT_SAD4(q, %%mm, 64)\
1785
    DCT_SAD4(q, %%mm, 72)\
1786
    "paddusw %%mm1, %%mm0             \n\t"\
1787
    HSUM(%%mm0, %%mm1, %0)
1788

    
1789
#define DCT_SAD_SSE2\
1790
    "pxor %%xmm0, %%xmm0              \n\t"\
1791
    "pxor %%xmm1, %%xmm1              \n\t"\
1792
    DCT_SAD4(dqa, %%xmm, 0)\
1793
    DCT_SAD4(dqa, %%xmm, 64)\
1794
    "paddusw %%xmm1, %%xmm0           \n\t"\
1795
    HSUM(%%xmm0, %%xmm1, %0)
1796

    
1797
#define DCT_SAD_FUNC(cpu) \
1798
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1799
    int sum;\
1800
    asm volatile(\
1801
        DCT_SAD\
1802
        :"=r"(sum)\
1803
        :"r"(block)\
1804
    );\
1805
    return sum&0xFFFF;\
1806
}
1807

    
1808
#define DCT_SAD       DCT_SAD_MMX
1809
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1810
#define MMABS(a,z)    MMABS_MMX(a,z)
1811
DCT_SAD_FUNC(mmx)
1812
#undef MMABS
1813
#undef HSUM
1814

    
1815
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1816
#define MMABS(a,z)    MMABS_MMX2(a,z)
1817
DCT_SAD_FUNC(mmx2)
1818
#undef HSUM
1819
#undef DCT_SAD
1820

    
1821
#define DCT_SAD       DCT_SAD_SSE2
1822
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1823
DCT_SAD_FUNC(sse2)
1824
#undef MMABS
1825

    
1826
#ifdef HAVE_SSSE3
1827
#define MMABS(a,z)    MMABS_SSSE3(a,z)
1828
DCT_SAD_FUNC(ssse3)
1829
#undef MMABS
1830
#endif
1831
#undef HSUM
1832
#undef DCT_SAD
1833

    
1834
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
1835
    int sum;
1836
    long i=size;
1837
    asm volatile(
1838
        "pxor %%mm4, %%mm4 \n"
1839
        "1: \n"
1840
        "sub $8, %0 \n"
1841
        "movq (%2,%0), %%mm2 \n"
1842
        "movq (%3,%0,2), %%mm0 \n"
1843
        "movq 8(%3,%0,2), %%mm1 \n"
1844
        "punpckhbw %%mm2, %%mm3 \n"
1845
        "punpcklbw %%mm2, %%mm2 \n"
1846
        "psraw $8, %%mm3 \n"
1847
        "psraw $8, %%mm2 \n"
1848
        "psubw %%mm3, %%mm1 \n"
1849
        "psubw %%mm2, %%mm0 \n"
1850
        "pmaddwd %%mm1, %%mm1 \n"
1851
        "pmaddwd %%mm0, %%mm0 \n"
1852
        "paddd %%mm1, %%mm4 \n"
1853
        "paddd %%mm0, %%mm4 \n"
1854
        "jg 1b \n"
1855
        "movq %%mm4, %%mm3 \n"
1856
        "psrlq $32, %%mm3 \n"
1857
        "paddd %%mm3, %%mm4 \n"
1858
        "movd %%mm4, %1 \n"
1859
        :"+r"(i), "=r"(sum)
1860
        :"r"(pix1), "r"(pix2)
1861
    );
1862
    return sum;
1863
}
1864

    
1865
#endif //CONFIG_ENCODERS
1866

    
1867
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
1868
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
1869

    
1870
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1871
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
1872
        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
1873
        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
1874
        "movq "#in7", " #m3 "             \n\t" /* d */\
1875
        "movq "#in0", %%mm5               \n\t" /* D */\
1876
        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
1877
        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
1878
        "movq "#in1", %%mm5               \n\t" /* C */\
1879
        "movq "#in2", %%mm6               \n\t" /* B */\
1880
        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
1881
        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
1882
        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
1883
        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
1884
        "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
1885
        "paddw " #rnd ", %%mm4            \n\t" /* x2 */\
1886
        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1887
        "psraw $5, %%mm5                  \n\t"\
1888
        "packuswb %%mm5, %%mm5            \n\t"\
1889
        OP(%%mm5, out, %%mm7, d)
1890

    
1891
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1892
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1893
    uint64_t temp;\
1894
\
1895
    asm volatile(\
1896
        "pxor %%mm7, %%mm7                \n\t"\
1897
        "1:                               \n\t"\
1898
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
1899
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
1900
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
1901
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
1902
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
1903
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
1904
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
1905
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
1906
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
1907
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
1908
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
1909
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
1910
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
1911
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
1912
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
1913
        "paddw %%mm3, %%mm5               \n\t" /* b */\
1914
        "paddw %%mm2, %%mm6               \n\t" /* c */\
1915
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
1916
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
1917
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
1918
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
1919
        "paddw %%mm4, %%mm0               \n\t" /* a */\
1920
        "paddw %%mm1, %%mm5               \n\t" /* d */\
1921
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1922
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
1923
        "paddw %6, %%mm6                  \n\t"\
1924
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
1925
        "psraw $5, %%mm0                  \n\t"\
1926
        "movq %%mm0, %5                   \n\t"\
1927
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1928
        \
1929
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
1930
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
1931
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
1932
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
1933
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
1934
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
1935
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
1936
        "paddw %%mm0, %%mm2               \n\t" /* b */\
1937
        "paddw %%mm5, %%mm3               \n\t" /* c */\
1938
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
1939
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
1940
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
1941
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
1942
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
1943
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
1944
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
1945
        "paddw %%mm2, %%mm1               \n\t" /* a */\
1946
        "paddw %%mm6, %%mm4               \n\t" /* d */\
1947
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1948
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
1949
        "paddw %6, %%mm1                  \n\t"\
1950
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
1951
        "psraw $5, %%mm3                  \n\t"\
1952
        "movq %5, %%mm1                   \n\t"\
1953
        "packuswb %%mm3, %%mm1            \n\t"\
1954
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
1955
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1956
        \
1957
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
1958
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
1959
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
1960
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
1961
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
1962
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
1963
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
1964
        "paddw %%mm1, %%mm5               \n\t" /* b */\
1965
        "paddw %%mm4, %%mm0               \n\t" /* c */\
1966
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
1967
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
1968
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
1969
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
1970
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
1971
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
1972
        "paddw %%mm3, %%mm2               \n\t" /* d */\
1973
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
1974
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
1975
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
1976
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
1977
        "paddw %%mm2, %%mm6               \n\t" /* a */\
1978
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
1979
        "paddw %6, %%mm0                  \n\t"\
1980
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
1981
        "psraw $5, %%mm0                  \n\t"\
1982
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
1983
        \
1984
        "paddw %%mm5, %%mm3               \n\t" /* a */\
1985
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
1986
        "paddw %%mm4, %%mm6               \n\t" /* b */\
1987
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
1988
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
1989
        "paddw %%mm1, %%mm4               \n\t" /* c */\
1990
        "paddw %%mm2, %%mm5               \n\t" /* d */\
1991
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
1992
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
1993
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
1994
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
1995
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
1996
        "paddw %6, %%mm4                  \n\t"\
1997
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
1998
        "psraw $5, %%mm4                  \n\t"\
1999
        "packuswb %%mm4, %%mm0            \n\t"\
2000
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2001
        \
2002
        "add %3, %0                       \n\t"\
2003
        "add %4, %1                       \n\t"\
2004
        "decl %2                          \n\t"\
2005
        " jnz 1b                          \n\t"\
2006
        : "+a"(src), "+c"(dst), "+m"(h)\
2007
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2008
        : "memory"\
2009
    );\
2010
}\
2011
\
2012
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2013
    int i;\
2014
    int16_t temp[16];\
2015
    /* quick HACK, XXX FIXME MUST be optimized */\
2016
    for(i=0; i<h; i++)\
2017
    {\
2018
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2019
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2020
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2021
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2022
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2023
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2024
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2025
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2026
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2027
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2028
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2029
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2030
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2031
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2032
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2033
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2034
        asm volatile(\
2035
            "movq (%0), %%mm0               \n\t"\
2036
            "movq 8(%0), %%mm1              \n\t"\
2037
            "paddw %2, %%mm0                \n\t"\
2038
            "paddw %2, %%mm1                \n\t"\
2039
            "psraw $5, %%mm0                \n\t"\
2040
            "psraw $5, %%mm1                \n\t"\
2041
            "packuswb %%mm1, %%mm0          \n\t"\
2042
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2043
            "movq 16(%0), %%mm0             \n\t"\
2044
            "movq 24(%0), %%mm1             \n\t"\
2045
            "paddw %2, %%mm0                \n\t"\
2046
            "paddw %2, %%mm1                \n\t"\
2047
            "psraw $5, %%mm0                \n\t"\
2048
            "psraw $5, %%mm1                \n\t"\
2049
            "packuswb %%mm1, %%mm0          \n\t"\
2050
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2051
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2052
            : "memory"\
2053
        );\
2054
        dst+=dstStride;\
2055
        src+=srcStride;\
2056
    }\
2057
}\
2058
\
2059
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2060
    uint64_t temp;\
2061
\
2062
    asm volatile(\
2063
        "pxor %%mm7, %%mm7                \n\t"\
2064
        "1:                               \n\t"\
2065
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
2066
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
2067
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
2068
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
2069
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
2070
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
2071
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
2072
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
2073
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
2074
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
2075
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
2076
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
2077
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
2078
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
2079
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
2080
        "paddw %%mm3, %%mm5               \n\t" /* b */\
2081
        "paddw %%mm2, %%mm6               \n\t" /* c */\
2082
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
2083
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
2084
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
2085
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
2086
        "paddw %%mm4, %%mm0               \n\t" /* a */\
2087
        "paddw %%mm1, %%mm5               \n\t" /* d */\
2088
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2089
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
2090
        "paddw %6, %%mm6                  \n\t"\
2091
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
2092
        "psraw $5, %%mm0                  \n\t"\
2093
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2094
        \
2095
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
2096
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
2097
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
2098
        "paddw %%mm5, %%mm1               \n\t" /* a */\
2099
        "paddw %%mm6, %%mm2               \n\t" /* b */\
2100
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
2101
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
2102
        "paddw %%mm6, %%mm3               \n\t" /* c */\
2103
        "paddw %%mm5, %%mm4               \n\t" /* d */\
2104
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
2105
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
2106
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2107
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
2108
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
2109
        "paddw %6, %%mm1                  \n\t"\
2110
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
2111
        "psraw $5, %%mm3                  \n\t"\
2112
        "packuswb %%mm3, %%mm0            \n\t"\
2113
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
2114
        \
2115
        "add %3, %0                       \n\t"\
2116
        "add %4, %1                       \n\t"\
2117
        "decl %2                          \n\t"\
2118
        " jnz 1b                          \n\t"\
2119
        : "+a"(src), "+c"(dst), "+m"(h)\
2120
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2121
        : "memory"\
2122
    );\
2123
}\
2124
\
2125
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2126
    int i;\
2127
    int16_t temp[8];\
2128
    /* quick HACK, XXX FIXME MUST be optimized */\
2129
    for(i=0; i<h; i++)\
2130
    {\
2131
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2132
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2133
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2134
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2135
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2136
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2137
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2138
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2139
        asm volatile(\
2140
            "movq (%0), %%mm0           \n\t"\
2141
            "movq 8(%0), %%mm1          \n\t"\
2142
            "paddw %2, %%mm0            \n\t"\
2143
            "paddw %2, %%mm1            \n\t"\
2144
            "psraw $5, %%mm0            \n\t"\
2145
            "psraw $5, %%mm1            \n\t"\
2146
            "packuswb %%mm1, %%mm0      \n\t"\
2147
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2148
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2149
            :"memory"\
2150
        );\
2151
        dst+=dstStride;\
2152
        src+=srcStride;\
2153
    }\
2154
}
2155

    
2156
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2157
\
2158
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2159
    uint64_t temp[17*4];\
2160
    uint64_t *temp_ptr= temp;\
2161
    int count= 17;\
2162
\
2163
    /*FIXME unroll */\
2164
    asm volatile(\
2165
        "pxor %%mm7, %%mm7              \n\t"\
2166
        "1:                             \n\t"\
2167
        "movq (%0), %%mm0               \n\t"\
2168
        "movq (%0), %%mm1               \n\t"\
2169
        "movq 8(%0), %%mm2              \n\t"\
2170
        "movq 8(%0), %%mm3              \n\t"\
2171
        "punpcklbw %%mm7, %%mm0         \n\t"\
2172
        "punpckhbw %%mm7, %%mm1         \n\t"\
2173
        "punpcklbw %%mm7, %%mm2         \n\t"\
2174
        "punpckhbw %%mm7, %%mm3         \n\t"\
2175
        "movq %%mm0, (%1)               \n\t"\
2176
        "movq %%mm1, 17*8(%1)           \n\t"\
2177
        "movq %%mm2, 2*17*8(%1)         \n\t"\
2178
        "movq %%mm3, 3*17*8(%1)         \n\t"\
2179
        "add $8, %1                     \n\t"\
2180
        "add %3, %0                     \n\t"\
2181
        "decl %2                        \n\t"\
2182
        " jnz 1b                        \n\t"\
2183
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2184
        : "r" ((long)srcStride)\
2185
        : "memory"\
2186
    );\
2187
    \
2188
    temp_ptr= temp;\
2189
    count=4;\
2190
    \
2191
/*FIXME reorder for speed */\
2192
    asm volatile(\
2193
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2194
        "1:                             \n\t"\
2195
        "movq (%0), %%mm0               \n\t"\
2196
        "movq 8(%0), %%mm1              \n\t"\
2197
        "movq 16(%0), %%mm2             \n\t"\
2198
        "movq 24(%0), %%mm3             \n\t"\
2199
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2200
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2201
        "add %4, %1                     \n\t"\
2202
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2203
        \
2204
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2205
        "add %4, %1                     \n\t"\
2206
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2207
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2208
        "add %4, %1                     \n\t"\
2209
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2210
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2211
        "add %4, %1                     \n\t"\
2212
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2213
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2214
        "add %4, %1                     \n\t"\
2215
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2216
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2217
        "add %4, %1                     \n\t"\
2218
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2219
        \
2220
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2221
        "add %4, %1                     \n\t"  \
2222
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2223
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2224
        \
2225
        "add $136, %0                   \n\t"\
2226
        "add %6, %1                     \n\t"\
2227
        "decl %2                        \n\t"\
2228
        " jnz 1b                        \n\t"\
2229
        \
2230
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2231
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2232
        :"memory"\
2233
    );\
2234
}\
2235
\
2236
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2237
    uint64_t temp[9*2];\
2238
    uint64_t *temp_ptr= temp;\
2239
    int count= 9;\
2240
\
2241
    /*FIXME unroll */\
2242
    asm volatile(\
2243
        "pxor %%mm7, %%mm7              \n\t"\
2244
        "1:                             \n\t"\
2245
        "movq (%0), %%mm0               \n\t"\
2246
        "movq (%0), %%mm1               \n\t"\
2247
        "punpcklbw %%mm7, %%mm0         \n\t"\
2248
        "punpckhbw %%mm7, %%mm1         \n\t"\
2249
        "movq %%mm0, (%1)               \n\t"\
2250
        "movq %%mm1, 9*8(%1)            \n\t"\
2251
        "add $8, %1                     \n\t"\
2252
        "add %3, %0                     \n\t"\
2253
        "decl %2                        \n\t"\
2254
        " jnz 1b                        \n\t"\
2255
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2256
        : "r" ((long)srcStride)\
2257
        : "memory"\
2258
    );\
2259
    \
2260
    temp_ptr= temp;\
2261
    count=2;\
2262
    \
2263
/*FIXME reorder for speed */\
2264
    asm volatile(\
2265
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2266
        "1:                             \n\t"\
2267
        "movq (%0), %%mm0               \n\t"\
2268
        "movq 8(%0), %%mm1              \n\t"\
2269
        "movq 16(%0), %%mm2             \n\t"\
2270
        "movq 24(%0), %%mm3             \n\t"\
2271
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2272
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2273
        "add %4, %1                     \n\t"\
2274
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2275
        \
2276
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2277
        "add %4, %1                     \n\t"\
2278
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2279
        \
2280
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2281
        "add %4, %1                     \n\t"\
2282
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2283
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2284
                \
2285
        "add $72, %0                    \n\t"\
2286
        "add %6, %1                     \n\t"\
2287
        "decl %2                        \n\t"\
2288
        " jnz 1b                        \n\t"\
2289
         \
2290
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2291
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2292
        : "memory"\
2293
   );\
2294
}\
2295
\
2296
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2297
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
2298
}\
2299
\
2300
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2301
    uint64_t temp[8];\
2302
    uint8_t * const half= (uint8_t*)temp;\
2303
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2304
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2305
}\
2306
\
2307
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2308
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2309
}\
2310
\
2311
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2312
    uint64_t temp[8];\
2313
    uint8_t * const half= (uint8_t*)temp;\
2314
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2315
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2316
}\
2317
\
2318
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2319
    uint64_t temp[8];\
2320
    uint8_t * const half= (uint8_t*)temp;\
2321
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2322
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2323
}\
2324
\
2325
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2326
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2327
}\
2328
\
2329
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2330
    uint64_t temp[8];\
2331
    uint8_t * const half= (uint8_t*)temp;\
2332
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2333
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2334
}\
2335
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2336
    uint64_t half[8 + 9];\
2337
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2338
    uint8_t * const halfHV= ((uint8_t*)half);\
2339
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2340
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2341
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2342
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2343
}\
2344
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2345
    uint64_t half[8 + 9];\
2346
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2347
    uint8_t * const halfHV= ((uint8_t*)half);\
2348
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2349
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2350
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2351
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2352
}\
2353
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2354
    uint64_t half[8 + 9];\
2355
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2356
    uint8_t * const halfHV= ((uint8_t*)half);\
2357
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2358
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2359
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2360
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2361
}\
2362
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2363
    uint64_t half[8 + 9];\
2364
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2365
    uint8_t * const halfHV= ((uint8_t*)half);\
2366
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2367
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2368
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2369
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2370
}\
2371
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2372
    uint64_t half[8 + 9];\
2373
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2374
    uint8_t * const halfHV= ((uint8_t*)half);\
2375
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2376
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2377
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2378
}\
2379
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2380
    uint64_t half[8 + 9];\
2381
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2382
    uint8_t * const halfHV= ((uint8_t*)half);\
2383
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2384
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2385
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2386
}\
2387
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2388
    uint64_t half[8 + 9];\
2389
    uint8_t * const halfH= ((uint8_t*)half);\
2390
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2391
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2392
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2393
}\
2394
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2395
    uint64_t half[8 + 9];\
2396
    uint8_t * const halfH= ((uint8_t*)half);\
2397
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2398
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2399
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2400
}\
2401
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2402
    uint64_t half[9];\
2403
    uint8_t * const halfH= ((uint8_t*)half);\
2404
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2405
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2406
}\
2407
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2408
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
2409
}\
2410
\
2411
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2412
    uint64_t temp[32];\
2413
    uint8_t * const half= (uint8_t*)temp;\
2414
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2415
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2416
}\
2417
\
2418
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2419
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2420
}\
2421
\
2422
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2423
    uint64_t temp[32];\
2424
    uint8_t * const half= (uint8_t*)temp;\
2425
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2426
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2427
}\
2428
\
2429
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2430
    uint64_t temp[32];\
2431
    uint8_t * const half= (uint8_t*)temp;\
2432
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2433
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2434
}\
2435
\
2436
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2437
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2438
}\
2439
\
2440
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2441
    uint64_t temp[32];\
2442
    uint8_t * const half= (uint8_t*)temp;\
2443
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2444
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2445
}\
2446
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2447
    uint64_t half[16*2 + 17*2];\
2448
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2449
    uint8_t * const halfHV= ((uint8_t*)half);\
2450
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2451
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2452
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2453
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2454
}\
2455
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2456
    uint64_t half[16*2 + 17*2];\
2457
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2458
    uint8_t * const halfHV= ((uint8_t*)half);\
2459
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2460
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2461
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2462
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2463
}\
2464
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2465
    uint64_t half[16*2 + 17*2];\
2466
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2467
    uint8_t * const halfHV= ((uint8_t*)half);\
2468
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2469
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2470
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2471
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2472
}\
2473
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2474
    uint64_t half[16*2 + 17*2];\
2475
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2476
    uint8_t * const halfHV= ((uint8_t*)half);\
2477
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2478
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2479
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2480
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2481
}\
2482
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2483
    uint64_t half[16*2 + 17*2];\
2484
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2485
    uint8_t * const halfHV= ((uint8_t*)half);\
2486
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2487
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2488
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2489
}\
2490
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2491
    uint64_t half[16*2 + 17*2];\
2492
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2493
    uint8_t * const halfHV= ((uint8_t*)half);\
2494
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2495
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2496
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2497
}\
2498
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2499
    uint64_t half[17*2];\
2500
    uint8_t * const halfH= ((uint8_t*)half);\
2501
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2502
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2503
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2504
}\
2505
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2506
    uint64_t half[17*2];\
2507
    uint8_t * const halfH= ((uint8_t*)half);\
2508
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2509
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2510
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2511
}\
2512
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2513
    uint64_t half[17*2];\
2514
    uint8_t * const halfH= ((uint8_t*)half);\
2515
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2516
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2517
}
2518

    
2519
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
2520
#define AVG_3DNOW_OP(a,b,temp, size) \
2521
"mov" #size " " #b ", " #temp "   \n\t"\
2522
"pavgusb " #temp ", " #a "        \n\t"\
2523
"mov" #size " " #a ", " #b "      \n\t"
2524
#define AVG_MMX2_OP(a,b,temp, size) \
2525
"mov" #size " " #b ", " #temp "   \n\t"\
2526
"pavgb " #temp ", " #a "          \n\t"\
2527
"mov" #size " " #a ", " #b "      \n\t"
2528

    
2529
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
2530
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
2531
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2532
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
2533
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
2534
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
2535
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
2536
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
2537
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2538

    
2539
/***********************************/
2540
/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
2541

    
2542
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2543
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2544
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2545
}
2546
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2547
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2548
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2549
}
2550

    
2551
#define QPEL_2TAP(OPNAME, SIZE, MMX)\
2552
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2553
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2554
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2555
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2556
                          OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2557
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2558
                          OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2559
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2560
                          OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2561
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2562
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2563
}\
2564
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2565
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2566
}\
2567
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
2568
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
2569
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0)\
2570
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
2571
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
2572
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
2573
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
2574
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
2575

    
2576
QPEL_2TAP(put_, 16, mmx2)
2577
QPEL_2TAP(avg_, 16, mmx2)
2578
QPEL_2TAP(put_,  8, mmx2)
2579
QPEL_2TAP(avg_,  8, mmx2)
2580
QPEL_2TAP(put_, 16, 3dnow)
2581
QPEL_2TAP(avg_, 16, 3dnow)
2582
QPEL_2TAP(put_,  8, 3dnow)
2583
QPEL_2TAP(avg_,  8, 3dnow)
2584

    
2585

    
2586
#if 0
2587
static void just_return() { return; }
2588
#endif
2589

    
2590
#define SET_QPEL_FUNC(postfix1, postfix2) \
2591
    c->put_ ## postfix1 = put_ ## postfix2;\
2592
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
2593
    c->avg_ ## postfix1 = avg_ ## postfix2;
2594

    
2595
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w = 8;
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    if( // non-constant fullpel offset (3% of blocks)
        (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
         oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
    {
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }

    asm volatile(
        "movd         %0, %%mm6 \n\t"
        "pxor      %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1<<shift)
    );

    for(x=0; x<w; x+=4){
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            asm volatile(
                "movq   %0,  %%mm4 \n\t"
                "movq   %1,  %%mm5 \n\t"
                "paddw  %2,  %%mm4 \n\t"
                "paddw  %3,  %%mm5 \n\t"
                "movq   %%mm4, %0  \n\t"
                "movq   %%mm5, %1  \n\t"
                "psrlw  $12, %%mm4 \n\t"
                "psrlw  $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            asm volatile(
                "movq   %%mm6, %%mm2 \n\t"
                "movq   %%mm6, %%mm1 \n\t"
                "psubw  %%mm4, %%mm2 \n\t"
                "psubw  %%mm5, %%mm1 \n\t"
                "movq   %%mm2, %%mm0 \n\t"
                "movq   %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)

                "movd   %4,    %%mm5 \n\t"
                "movd   %3,    %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy

                "movd   %2,    %%mm5 \n\t"
                "movd   %1,    %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw  %5,    %%mm1 \n\t"
                "paddw  %%mm3, %%mm2 \n\t"
                "paddw  %%mm1, %%mm0 \n\t"
                "paddw  %%mm2, %%mm0 \n\t"

                "psrlw    %6,    %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd     %%mm0, %0    \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4-h*stride;
    }
}

#ifdef CONFIG_ENCODERS
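/* Shared asm helper macros for the quant noise shaping routines in
 * dsputil_mmx_qns.h; DEF, SET_RND, SCALE_OFFSET and PMULHRW are redefined
 * before each inclusion to emit MMX, 3DNow! and SSSE3 variants. */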
#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3

#endif /* CONFIG_ENCODERS */

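/* Generate one prefetch helper per ISA: prefetcht0 for MMX2, the 3DNow!
 * prefetch instruction otherwise.  Each helper touches h rows spaced
 * stride bytes apart. */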
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        asm volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH

#include "h264dsp_mmx.c"

/* CAVS specific */
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* FLAC specific */
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);

/* VC1 specific */
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}

/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

/* XXX: those functions should be removed as soon as all IDCTs have been
   converted */
#ifdef CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}

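/* Vorbis inverse channel coupling: convert the coded magnitude/angle pair in
 * mag[]/ang[] back into the two output channels, handling the sign cases
 * without branches (see the per-instruction comments below). */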
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    asm volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        asm volatile(
            "movq    %0,    %%mm0 \n\t"
            "movq    %1,    %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1 \n\t"
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t"
            "pandn   %%mm1, %%mm4 \n\t"
            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    asm volatile("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    asm volatile(
            "movaps  %0,     %%xmm5 \n\t"
        ::"m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        asm volatile(
            "movaps  %0,     %%xmm0 \n\t"
            "movaps  %1,     %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}

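/* In-place element-wise multiply: dst[i] *= src[i] for i in [0,len).  Both
 * versions walk the arrays back to front; the 3DNow! loop handles 4 floats
 * per iteration, the SSE loop 8 (and requires 16-byte aligned pointers). */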
static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long i = (len-4)*4;
    asm volatile(
        "1: \n\t"
        "movq    (%1,%0), %%mm0 \n\t"
        "movq   8(%1,%0), %%mm1 \n\t"
        "pfmul   (%2,%0), %%mm0 \n\t"
        "pfmul  8(%2,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub  $16, %0 \n\t"
        "jge 1b \n\t"
        "femms  \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i = (len-8)*4;
    asm volatile(
        "1: \n\t"
        "movaps    (%1,%0), %%xmm0 \n\t"
        "movaps  16(%1,%0), %%xmm1 \n\t"
        "mulps     (%2,%0), %%xmm0 \n\t"
        "mulps   16(%2,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub  $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}

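/* dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read in reverse,
 * using pswapd (3DNow! ext) respectively shufps (SSE) to flip element order. */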
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;
    asm volatile(
        "1: \n\t"
        "pswapd   8(%1), %%mm0 \n\t"
        "pswapd    (%1), %%mm1 \n\t"
        "pfmul  (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq  %%mm0,  (%2,%0) \n\t"
        "movq  %%mm1, 8(%2,%0) \n\t"
        "add   $16, %1 \n\t"
        "sub   $16, %0 \n\t"
        "jge   1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
    asm volatile("femms");
}
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;
    asm volatile(
        "1: \n\t"
        "movaps        16(%1), %%xmm0 \n\t"
        "movaps          (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps        (%3,%0), %%xmm0 \n\t"
        "mulps      16(%3,%0), %%xmm1 \n\t"
        "movaps     %%xmm0,   (%2,%0) \n\t"
        "movaps     %%xmm1, 16(%2,%0) \n\t"
        "add    $32, %1 \n\t"
        "sub    $32, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
}

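/* dst = src0*src1 + src2, optionally writing every step-th output element.
 * Only step==1 and step==2 with src3==0 are done in asm; everything else
 * falls back to ff_vector_fmul_add_add_c(). */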
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long i = (len-4)*4;
    if(step == 2 && src3 == 0){
        dst += (len-4)*2;
        asm volatile(
            "1: \n\t"
            "movq   (%2,%0),  %%mm0 \n\t"
            "movq  8(%2,%0),  %%mm1 \n\t"
            "pfmul  (%3,%0),  %%mm0 \n\t"
            "pfmul 8(%3,%0),  %%mm1 \n\t"
            "pfadd  (%4,%0),  %%mm0 \n\t"
            "pfadd 8(%4,%0),  %%mm1 \n\t"
            "movd     %%mm0,   (%1) \n\t"
            "movd     %%mm1, 16(%1) \n\t"
            "psrlq      $32,  %%mm0 \n\t"
            "psrlq      $32,  %%mm1 \n\t"
            "movd     %%mm0,  8(%1) \n\t"
            "movd     %%mm1, 24(%1) \n\t"
            "sub  $32, %1 \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movq  %%mm0,   (%1,%0) \n\t"
            "movq  %%mm1,  8(%1,%0) \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    asm volatile("femms");
}
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    long i = (len-8)*4;
    if(step == 2 && src3 == 0){
        dst += (len-8)*2;
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movss     %%xmm0,   (%1) \n\t"
            "movss     %%xmm1, 32(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 16(%1) \n\t"
            "movss     %%xmm3, 48(%1) \n\t"
            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
            "movss     %%xmm0,  8(%1) \n\t"
            "movss     %%xmm1, 40(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 24(%1) \n\t"
            "movss     %%xmm3, 56(%1) \n\t"
            "sub  $64, %1 \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movaps %%xmm0,   (%1,%0) \n\t"
            "movaps %%xmm1, 16(%1,%0) \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}

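/* Convert len floats to signed 16-bit with saturation (packssdw).  The
 * 3DNow! pf2id path rounds differently from the C and SSE paths, so the
 * results are not bit-exact across implementations. */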
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "pf2id       %1, %%mm0 \n\t"
            "pf2id       %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("femms");
}
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "cvtps2pi    %1, %%mm0 \n\t"
            "cvtps2pi    %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("emms");
}

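/* Snow wavelet compose and add_yblock helpers (defined in the Snow DSP code) */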
extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);

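/* Runtime dispatcher: probe the CPU once, apply the caller's dsp_mask
 * override, then install the fastest available implementation of each
 * DSPContext function pointer. */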
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    mm_flags = mm_support();

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & FF_MM_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }

#if 0
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        av_log(avctx, AV_LOG_INFO, " mmx");
    if (mm_flags & MM_MMXEXT)
        av_log(avctx, AV_LOG_INFO, " mmxext");
    if (mm_flags & MM_3DNOW)
        av_log(avctx, AV_LOG_INFO, " 3dnow");
    if (mm_flags & MM_SSE)
        av_log(avctx, AV_LOG_INFO, " sse");
    if (mm_flags & MM_SSE2)
        av_log(avctx, AV_LOG_INFO, " sse2");
    av_log(avctx, AV_LOG_INFO, "\n");
#endif

    if (mm_flags & MM_MMX) {
        const int idct_algo= avctx->idct_algo;

#ifdef CONFIG_ENCODERS
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }
#endif //CONFIG_ENCODERS
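        /* IDCT selection: only install an accelerated IDCT when decoding at
           full resolution, honoring idct_algo and the bitexact flag. */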
        if(avctx->lowres==0){
            if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
                c->idct_put= ff_simple_idct_put_mmx;
                c->idct_add= ff_simple_idct_add_mmx;
                c->idct    = ff_simple_idct_mmx;
                c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
#ifdef CONFIG_GPL
            }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
                if(mm_flags & MM_MMXEXT){
                    c->idct_put= ff_libmpeg2mmx2_idct_put;
                    c->idct_add= ff_libmpeg2mmx2_idct_add;
                    c->idct    = ff_mmxext_idct;
                }else{
                    c->idct_put= ff_libmpeg2mmx_idct_put;
                    c->idct_add= ff_libmpeg2mmx_idct_add;
                    c->idct    = ff_mmx_idct;
                }
                c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
#endif
            }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
                     idct_algo==FF_IDCT_VP3 &&
                     avctx->codec->id!=CODEC_ID_THEORA &&
                     !(avctx->flags & CODEC_FLAG_BITEXACT)){
                if(mm_flags & MM_SSE2){
                    c->idct_put= ff_vp3_idct_put_sse2;
                    c->idct_add= ff_vp3_idct_add_sse2;
                    c->idct    = ff_vp3_idct_sse2;
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
                }else{
                    ff_vp3_dsp_init_mmx();
                    c->idct_put= ff_vp3_idct_put_mmx;
                    c->idct_add= ff_vp3_idct_add_mmx;
                    c->idct    = ff_vp3_idct_mmx;
                    c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                }
            }else if(idct_algo==FF_IDCT_CAVS){
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
            }else if(idct_algo==FF_IDCT_XVIDMMX){
                if(mm_flags & MM_MMXEXT){
                    c->idct_put= ff_idct_xvid_mmx2_put;
                    c->idct_add= ff_idct_xvid_mmx2_add;
                    c->idct    = ff_idct_xvid_mmx2;
                }else{
                    c->idct_put= ff_idct_xvid_mmx_put;
                    c->idct_add= ff_idct_xvid_mmx_add;
                    c->idct    = ff_idct_xvid_mmx;
                }
            }
        }

#ifdef