ffmpeg / libavcodec / i386 / dsputil_mmx.c @ fa9b873e

/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "dsputil.h"
#include "dsputil_mmx.h"
#include "simple_idct.h"
#include "mpegvideo.h"
#include "x86_cpu.h"
#include "mmx.h"
#include "vp3dsp_mmx.h"
#include "vp3dsp_sse2.h"
#include "h263.h"

//#undef NDEBUG
//#include <assert.h>

extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */

/* pixel operations */
DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_5  ) = 0x0005000500050005ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED_16(const uint64_t, ff_pw_128) = 0x0080008000800080ULL;

DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };

#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// for a shared library it is better to access these constants this way,
// building them in registers rather than loading them from memory
// (pcmpeqd sets every bit, i.e. -1)
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
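// in effect, both PAVGB variants below average bytes without needing a 9th bit:
// the no-rounding form computes (a & b) + (((a ^ b) & 0xfe) >> 1) == (a + b) >> 1,
// the rounding form computes    (a | b) - (((a ^ b) & 0xfe) >> 1) == (a + b + 1) >> 1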
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx

/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
#endif //CONFIG_ENCODERS

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
        __asm __volatile(
                "movq   %3, %%mm0               \n\t"
                "movq   8%3, %%mm1              \n\t"
                "movq   16%3, %%mm2             \n\t"
                "movq   24%3, %%mm3             \n\t"
                "movq   32%3, %%mm4             \n\t"
                "movq   40%3, %%mm5             \n\t"
                "movq   48%3, %%mm6             \n\t"
                "movq   56%3, %%mm7             \n\t"
                "packuswb %%mm1, %%mm0          \n\t"
                "packuswb %%mm3, %%mm2          \n\t"
                "packuswb %%mm5, %%mm4          \n\t"
                "packuswb %%mm7, %%mm6          \n\t"
                "movq   %%mm0, (%0)             \n\t"
                "movq   %%mm2, (%0, %1)         \n\t"
                "movq   %%mm4, (%0, %1, 2)      \n\t"
                "movq   %%mm6, (%0, %2)         \n\t"
                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
                :"memory");
        pix += line_size*4;
        p += 32;

    // if this were an exact copy of the code above, the compiler would
    // generate some very strange code, thus we use "r" here
    __asm __volatile(
            "movq       (%3), %%mm0             \n\t"
            "movq       8(%3), %%mm1            \n\t"
            "movq       16(%3), %%mm2           \n\t"
            "movq       24(%3), %%mm3           \n\t"
            "movq       32(%3), %%mm4           \n\t"
            "movq       40(%3), %%mm5           \n\t"
            "movq       48(%3), %%mm6           \n\t"
            "movq       56(%3), %%mm7           \n\t"
            "packuswb %%mm1, %%mm0              \n\t"
            "packuswb %%mm3, %%mm2              \n\t"
            "packuswb %%mm5, %%mm4              \n\t"
            "packuswb %%mm7, %%mm6              \n\t"
            "movq       %%mm0, (%0)             \n\t"
            "movq       %%mm2, (%0, %1)         \n\t"
            "movq       %%mm4, (%0, %1, 2)      \n\t"
            "movq       %%mm6, (%0, %2)         \n\t"
            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
            :"memory");
}

static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}

void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
                "movq   (%2), %%mm0     \n\t"
                "movq   8(%2), %%mm1    \n\t"
                "movq   16(%2), %%mm2   \n\t"
                "movq   24(%2), %%mm3   \n\t"
                "movq   %0, %%mm4       \n\t"
                "movq   %1, %%mm6       \n\t"
                "movq   %%mm4, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm4, %%mm0    \n\t"
                "paddsw %%mm5, %%mm1    \n\t"
                "movq   %%mm6, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm6, %%mm2    \n\t"
                "paddsw %%mm5, %%mm3    \n\t"
                "packuswb %%mm1, %%mm0  \n\t"
                "packuswb %%mm3, %%mm2  \n\t"
                "movq   %%mm0, %0       \n\t"
                "movq   %%mm2, %1       \n\t"
                :"+m"(*pix), "+m"(*(pix+line_size))
                :"r"(p)
                :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "mov $-128*6, %%"REG_a"         \n\t"
                "1:                             \n\t"
                "movq %%mm7, (%0, %%"REG_a")    \n\t"
                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
                "add $32, %%"REG_a"             \n\t"
                " js 1b                         \n\t"
                : : "r" (((uint8_t *)blocks)+128*6)
                : "%"REG_a
        );
}

#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((long)line_size)
        );

        return sum;
}
#endif //CONFIG_ENCODERS

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7              \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "psubw %%mm2, %%mm0             \n\t"\
        "psubw %%mm3, %%mm1             \n\t"\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "punpcklbw %%mm7, %%mm4         \n\t"\
        "punpckhbw %%mm7, %%mm5         \n\t"\
        "psubw %%mm2, %%mm4             \n\t"\
        "psubw %%mm3, %%mm5             \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4             \n\t"\
        "paddw %%mm1, %%mm5             \n\t"\
        "pxor %%mm6, %%mm6              \n\t"\
        "pcmpgtw %%mm4, %%mm6           \n\t"\
        "pcmpgtw %%mm5, %%mm7           \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "pxor %%mm7, %%mm5              \n\t"\
        "psubw %%mm6, %%mm4             \n\t"\
        "psubw %%mm7, %%mm5             \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm4          \n\t"\
        "packsswb %%mm7, %%mm6          \n\t"\
        "pxor %%mm7, %%mm7              \n\t"\
        "movd %4, %%mm2                 \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "psubusb %%mm4, %%mm2           \n\t"\
        "movq %%mm2, %%mm3              \n\t"\
        "psubusb %%mm4, %%mm3           \n\t"\
        "psubb %%mm3, %%mm2             \n\t"\
        "movq %1, %%mm3                 \n\t"\
        "movq %2, %%mm4                 \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm3           \n\t"\
        "psubusb %%mm2, %%mm4           \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm2           \n\t"\
        "packsswb %%mm1, %%mm0          \n\t"\
        "pcmpgtb %%mm0, %%mm7           \n\t"\
        "pxor %%mm7, %%mm0              \n\t"\
        "psubb %%mm7, %%mm0             \n\t"\
        "movq %%mm0, %%mm1              \n\t"\
        "psubusb %%mm2, %%mm0           \n\t"\
        "psubb %%mm0, %%mm1             \n\t"\
        "pand %5, %%mm1                 \n\t"\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1              \n\t"\
        "psubb %%mm7, %%mm1             \n\t"\
        "movq %0, %%mm5                 \n\t"\
        "movq %3, %%mm6                 \n\t"\
        "psubb %%mm1, %%mm5             \n\t"\
        "paddb %%mm1, %%mm6             \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    DECLARE_ALIGNED(8, uint64_t, temp[4]);
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1              \n\t"
        "movq %%mm4, %%mm0              \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "movq %%mm1, %%mm6              \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd %%mm5, (%0)               \n\t"
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd %%mm5, (%0,%2)            \n\t"
        "movd %%mm3, (%0,%2,2)          \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd %%mm3, (%0,%3)            \n\t"
        "movd %%mm1, (%1)               \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%1,%2)            \n\t"
        "movd %%mm6, (%1,%2,2)          \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd %%mm6, (%1,%3)            \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride ),
           "r" ((long)(3*stride))
    );
    }
}

#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((long)line_size));
    return tmp;
}

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

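/* nsse (roughly, a noise-preserving SSE): plain SSE plus a weighted absolute
   difference between the high-frequency measures of the two blocks as
   computed by hf_noise*, so a mismatch in "noisiness" also adds to the score */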
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

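/* vsad_intra: sums the absolute differences between vertically adjacent rows
   of the block itself, i.e. a measure of its vertical activity */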
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

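/* per byte: pred = median(L, T, L+T-LT), obtained by clipping L+T-LT into
   [min(L,T), max(L,T)] with pminub/pmaxub, then dst = src2 - pred;
   the first pixel of the row falls back to the scalar mid_pred() below */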
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1484
    long i=0;
1485
    uint8_t l, lt;
1486

    
1487
    asm volatile(
1488
        "1:                             \n\t"
1489
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
1490
        "movq  (%1, %0), %%mm1          \n\t" // T
1491
        "movq  -1(%2, %0), %%mm2        \n\t" // L
1492
        "movq  (%2, %0), %%mm3          \n\t" // X
1493
        "movq %%mm2, %%mm4              \n\t" // L
1494
        "psubb %%mm0, %%mm2             \n\t"
1495
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
1496
        "movq %%mm4, %%mm5              \n\t" // L
1497
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
1498
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
1499
        "pminub %%mm2, %%mm4            \n\t"
1500
        "pmaxub %%mm1, %%mm4            \n\t"
1501
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
1502
        "movq %%mm3, (%3, %0)           \n\t"
1503
        "add $8, %0                     \n\t"
1504
        "cmp %4, %0                     \n\t"
1505
        " jb 1b                         \n\t"
1506
        : "+r" (i)
1507
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1508
    );
1509

    
1510
    l= *left;
1511
    lt= *left_top;
1512

    
1513
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1514

    
1515
    *left_top= src1[w-1];
1516
    *left    = src2[w-1];
1517
}
1518

    
1519
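/* Illustrative plain-C sketch (not compiled) of the per-byte operation in the
 * MMX2 loop above: each output is the input minus the HuffYUV median
 * predictor median(L, T, L+T-LT) of its left, top and top-left neighbours.
 * The reference function name is hypothetical; as in the MMX2 version, the
 * i==0 element would be fixed up afterwards from *left / *left_top. */
#if 0
static void sub_hfyu_median_prediction_ref(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i=0; i<w; i++){
        const int lt= src1[i-1];                            // LT: top-left
        const int t = src1[i];                              // T:  top
        const int l = src2[i-1];                            // L:  left
        const int pred= mid_pred(l, t, (l + t - lt)&0xFF);  // median of L, T, L+T-LT
        dst[i]= src2[i] - pred;                             // store the residual
    }
}
#endif
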
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

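/* Illustrative plain-C sketch (not compiled) of the transform LBUTTERFLY2 and
 * HADAMARD8 implement on eight registers of packed words: a three-stage
 * 8-point Walsh-Hadamard butterfly network where each butterfly maps
 * (a,b) -> (a+b, b-a) (paddw/paddw/psubw).  The sign of the difference terms
 * does not matter, since only absolute values are summed afterwards.
 * The function name is hypothetical. */
#if 0
static void hadamard8_1d_ref(int x[8]){
    int stage, i, j;
    for(stage=1; stage<8; stage<<=1){        // pair distance 1, 2, 4 = the 3 LBUTTERFLY2 passes
        for(i=0; i<8; i+=2*stage){
            for(j=i; j<i+stage; j++){
                const int a= x[j];
                const int b= x[j+stage];
                x[j]      = a + b;           // "paddw b, a"
                x[j+stage]= b - a;           // "paddw b, b" then "psubw a, b"
            }
        }
    }
}
#endif
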
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif

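/* Illustrative plain-C sketch (not compiled) of the branchless 16-bit absolute
 * value the MMABS_* variants compute per lane: MMABS_MMX uses the
 * xor/subtract-sign-mask identity, MMABS_MMX2 uses max(a, -a), MMABS_SSSE3 a
 * single pabsw.  As with the asm, -32768 maps to itself. */
#if 0
static int16_t mmabs_ref(int16_t a){
    const int16_t mask= (a < 0) ? -1 : 0;        // pcmpgtw: all-ones where a is negative
    return (int16_t)((a ^ mask) - mask);         // pxor + psubw == |a| in 16-bit arithmetic
}
#endif
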
#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1),    "#a"           \n\t"\
    "movq "#o"+8(%1),  "#b"           \n\t"\
    "movq "#o"+16(%1), "#c"           \n\t"\
    "movq "#o"+24(%1), "#d"           \n\t"\

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)              \n\t"\
    "movq "#b", "#o"+8(%1)            \n\t"\
    "movq "#c", "#o"+16(%1)           \n\t"\
    "movq "#d", "#o"+24(%1)           \n\t"\

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    long i=size;
    asm volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
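/* Illustrative plain-C sketch (not compiled) of what the MMX loop above
 * accumulates: the sum of squared differences between an int8 block and an
 * int16 block.  The reference name is hypothetical and mirrors the generic C
 * implementation in dsputil.c. */
#if 0
static int ssd_int8_vs_int16_ref(const int8_t *pix1, const int16_t *pix2, int size){
    int sum=0, i;
    for(i=0; i<size; i++){
        const int d= pix2[i] - pix1[i];   // psubw after sign-extending the int8 samples
        sum += d*d;                       // pmaddwd + paddd accumulate the squares
    }
    return sum;
}
#endif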

    
#endif //CONFIG_ENCODERS

#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1881
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
1882
        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
1883
        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
1884
        "movq "#in7", " #m3 "             \n\t" /* d */\
1885
        "movq "#in0", %%mm5               \n\t" /* D */\
1886
        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
1887
        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
1888
        "movq "#in1", %%mm5               \n\t" /* C */\
1889
        "movq "#in2", %%mm6               \n\t" /* B */\
1890
        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
1891
        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
1892
        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
1893
        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
1894
        "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
1895
        "paddw " #rnd ", %%mm4            \n\t" /* x2 */\
1896
        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1897
        "psraw $5, %%mm5                  \n\t"\
1898
        "packuswb %%mm5, %%mm5            \n\t"\
1899
        OP(%%mm5, out, %%mm7, d)
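/* Illustrative plain-C sketch (not compiled) of the 1-D filter QPEL_V_LOW (and
 * the horizontal lowpass loops below) evaluate per output sample: the MPEG-4
 * half-sample 8-tap kernel (-1, 3, -6, 20, 20, -6, 3, -1), rounded and scaled
 * by 1/32, then clamped as packuswb does.  rnd is 16 for the rounding and 15
 * for the no-rounding variants; names here are hypothetical. */
#if 0
static int qpel_lowpass_tap_ref(const uint8_t *src, int rnd){
    const int x1= src[ 0] + src[1];   // centre pair,        weight 20
    const int x2= src[-1] + src[2];   // first neighbours,   weight -6
    const int x3= src[-2] + src[3];   // second neighbours,  weight  3
    const int x4= src[-3] + src[4];   // outermost pair,     weight -1
    const int v = (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5;
    return v < 0 ? 0 : v > 255 ? 255 : v;
}
#endif
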
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1902
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1903
    uint64_t temp;\
1904
\
1905
    asm volatile(\
1906
        "pxor %%mm7, %%mm7                \n\t"\
1907
        "1:                               \n\t"\
1908
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
1909
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
1910
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
1911
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
1912
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
1913
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
1914
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
1915
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
1916
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
1917
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
1918
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
1919
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
1920
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
1921
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
1922
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
1923
        "paddw %%mm3, %%mm5               \n\t" /* b */\
1924
        "paddw %%mm2, %%mm6               \n\t" /* c */\
1925
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
1926
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
1927
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
1928
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
1929
        "paddw %%mm4, %%mm0               \n\t" /* a */\
1930
        "paddw %%mm1, %%mm5               \n\t" /* d */\
1931
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1932
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
1933
        "paddw %6, %%mm6                  \n\t"\
1934
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
1935
        "psraw $5, %%mm0                  \n\t"\
1936
        "movq %%mm0, %5                   \n\t"\
1937
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1938
        \
1939
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
1940
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
1941
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
1942
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
1943
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
1944
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
1945
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
1946
        "paddw %%mm0, %%mm2               \n\t" /* b */\
1947
        "paddw %%mm5, %%mm3               \n\t" /* c */\
1948
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
1949
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
1950
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
1951
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
1952
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
1953
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
1954
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
1955
        "paddw %%mm2, %%mm1               \n\t" /* a */\
1956
        "paddw %%mm6, %%mm4               \n\t" /* d */\
1957
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1958
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
1959
        "paddw %6, %%mm1                  \n\t"\
1960
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
1961
        "psraw $5, %%mm3                  \n\t"\
1962
        "movq %5, %%mm1                   \n\t"\
1963
        "packuswb %%mm3, %%mm1            \n\t"\
1964
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
1965
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1966
        \
1967
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
1968
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
1969
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
1970
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
1971
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
1972
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
1973
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
1974
        "paddw %%mm1, %%mm5               \n\t" /* b */\
1975
        "paddw %%mm4, %%mm0               \n\t" /* c */\
1976
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
1977
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
1978
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
1979
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
1980
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
1981
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
1982
        "paddw %%mm3, %%mm2               \n\t" /* d */\
1983
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
1984
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
1985
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
1986
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
1987
        "paddw %%mm2, %%mm6               \n\t" /* a */\
1988
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
1989
        "paddw %6, %%mm0                  \n\t"\
1990
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
1991
        "psraw $5, %%mm0                  \n\t"\
1992
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
1993
        \
1994
        "paddw %%mm5, %%mm3               \n\t" /* a */\
1995
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
1996
        "paddw %%mm4, %%mm6               \n\t" /* b */\
1997
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
1998
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
1999
        "paddw %%mm1, %%mm4               \n\t" /* c */\
2000
        "paddw %%mm2, %%mm5               \n\t" /* d */\
2001
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
2002
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
2003
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2004
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
2005
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
2006
        "paddw %6, %%mm4                  \n\t"\
2007
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
2008
        "psraw $5, %%mm4                  \n\t"\
2009
        "packuswb %%mm4, %%mm0            \n\t"\
2010
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2011
        \
2012
        "add %3, %0                       \n\t"\
2013
        "add %4, %1                       \n\t"\
2014
        "decl %2                          \n\t"\
2015
        " jnz 1b                          \n\t"\
2016
        : "+a"(src), "+c"(dst), "+m"(h)\
2017
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2018
        : "memory"\
2019
    );\
2020
}\
2021
\
2022
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2023
    int i;\
2024
    int16_t temp[16];\
2025
    /* quick HACK, XXX FIXME MUST be optimized */\
2026
    for(i=0; i<h; i++)\
2027
    {\
2028
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2029
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2030
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2031
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2032
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2033
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2034
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2035
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2036
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2037
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2038
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2039
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2040
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2041
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2042
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2043
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2044
        asm volatile(\
2045
            "movq (%0), %%mm0               \n\t"\
2046
            "movq 8(%0), %%mm1              \n\t"\
2047
            "paddw %2, %%mm0                \n\t"\
2048
            "paddw %2, %%mm1                \n\t"\
2049
            "psraw $5, %%mm0                \n\t"\
2050
            "psraw $5, %%mm1                \n\t"\
2051
            "packuswb %%mm1, %%mm0          \n\t"\
2052
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2053
            "movq 16(%0), %%mm0             \n\t"\
2054
            "movq 24(%0), %%mm1             \n\t"\
2055
            "paddw %2, %%mm0                \n\t"\
2056
            "paddw %2, %%mm1                \n\t"\
2057
            "psraw $5, %%mm0                \n\t"\
2058
            "psraw $5, %%mm1                \n\t"\
2059
            "packuswb %%mm1, %%mm0          \n\t"\
2060
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2061
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2062
            : "memory"\
2063
        );\
2064
        dst+=dstStride;\
2065
        src+=srcStride;\
2066
    }\
2067
}\
2068
\
2069
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2070
    uint64_t temp;\
2071
\
2072
    asm volatile(\
2073
        "pxor %%mm7, %%mm7                \n\t"\
2074
        "1:                               \n\t"\
2075
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
2076
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
2077
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
2078
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
2079
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
2080
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
2081
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
2082
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
2083
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
2084
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
2085
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
2086
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
2087
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
2088
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
2089
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
2090
        "paddw %%mm3, %%mm5               \n\t" /* b */\
2091
        "paddw %%mm2, %%mm6               \n\t" /* c */\
2092
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
2093
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
2094
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
2095
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
2096
        "paddw %%mm4, %%mm0               \n\t" /* a */\
2097
        "paddw %%mm1, %%mm5               \n\t" /* d */\
2098
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2099
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
2100
        "paddw %6, %%mm6                  \n\t"\
2101
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
2102
        "psraw $5, %%mm0                  \n\t"\
2103
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2104
        \
2105
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
2106
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
2107
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
2108
        "paddw %%mm5, %%mm1               \n\t" /* a */\
2109
        "paddw %%mm6, %%mm2               \n\t" /* b */\
2110
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
2111
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
2112
        "paddw %%mm6, %%mm3               \n\t" /* c */\
2113
        "paddw %%mm5, %%mm4               \n\t" /* d */\
2114
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
2115
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
2116
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2117
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
2118
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
2119
        "paddw %6, %%mm1                  \n\t"\
2120
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
2121
        "psraw $5, %%mm3                  \n\t"\
2122
        "packuswb %%mm3, %%mm0            \n\t"\
2123
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
2124
        \
2125
        "add %3, %0                       \n\t"\
2126
        "add %4, %1                       \n\t"\
2127
        "decl %2                          \n\t"\
2128
        " jnz 1b                          \n\t"\
2129
        : "+a"(src), "+c"(dst), "+m"(h)\
2130
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2131
        : "memory"\
2132
    );\
2133
}\
2134
\
2135
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2136
    int i;\
2137
    int16_t temp[8];\
2138
    /* quick HACK, XXX FIXME MUST be optimized */\
2139
    for(i=0; i<h; i++)\
2140
    {\
2141
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2142
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2143
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2144
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2145
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2146
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2147
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2148
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2149
        asm volatile(\
2150
            "movq (%0), %%mm0           \n\t"\
2151
            "movq 8(%0), %%mm1          \n\t"\
2152
            "paddw %2, %%mm0            \n\t"\
2153
            "paddw %2, %%mm1            \n\t"\
2154
            "psraw $5, %%mm0            \n\t"\
2155
            "psraw $5, %%mm1            \n\t"\
2156
            "packuswb %%mm1, %%mm0      \n\t"\
2157
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2158
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2159
            :"memory"\
2160
        );\
2161
        dst+=dstStride;\
2162
        src+=srcStride;\
2163
    }\
2164
}
2165

    
2166
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2167
\
2168
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2169
    uint64_t temp[17*4];\
2170
    uint64_t *temp_ptr= temp;\
2171
    int count= 17;\
2172
\
2173
    /*FIXME unroll */\
2174
    asm volatile(\
2175
        "pxor %%mm7, %%mm7              \n\t"\
2176
        "1:                             \n\t"\
2177
        "movq (%0), %%mm0               \n\t"\
2178
        "movq (%0), %%mm1               \n\t"\
2179
        "movq 8(%0), %%mm2              \n\t"\
2180
        "movq 8(%0), %%mm3              \n\t"\
2181
        "punpcklbw %%mm7, %%mm0         \n\t"\
2182
        "punpckhbw %%mm7, %%mm1         \n\t"\
2183
        "punpcklbw %%mm7, %%mm2         \n\t"\
2184
        "punpckhbw %%mm7, %%mm3         \n\t"\
2185
        "movq %%mm0, (%1)               \n\t"\
2186
        "movq %%mm1, 17*8(%1)           \n\t"\
2187
        "movq %%mm2, 2*17*8(%1)         \n\t"\
2188
        "movq %%mm3, 3*17*8(%1)         \n\t"\
2189
        "add $8, %1                     \n\t"\
2190
        "add %3, %0                     \n\t"\
2191
        "decl %2                        \n\t"\
2192
        " jnz 1b                        \n\t"\
2193
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2194
        : "r" ((long)srcStride)\
2195
        : "memory"\
2196
    );\
2197
    \
2198
    temp_ptr= temp;\
2199
    count=4;\
2200
    \
2201
/*FIXME reorder for speed */\
2202
    asm volatile(\
2203
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2204
        "1:                             \n\t"\
2205
        "movq (%0), %%mm0               \n\t"\
2206
        "movq 8(%0), %%mm1              \n\t"\
2207
        "movq 16(%0), %%mm2             \n\t"\
2208
        "movq 24(%0), %%mm3             \n\t"\
2209
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2210
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2211
        "add %4, %1                     \n\t"\
2212
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2213
        \
2214
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2215
        "add %4, %1                     \n\t"\
2216
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2217
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2218
        "add %4, %1                     \n\t"\
2219
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2220
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2221
        "add %4, %1                     \n\t"\
2222
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2223
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2224
        "add %4, %1                     \n\t"\
2225
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2226
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2227
        "add %4, %1                     \n\t"\
2228
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2229
        \
2230
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2231
        "add %4, %1                     \n\t"  \
2232
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2233
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2234
        \
2235
        "add $136, %0                   \n\t"\
2236
        "add %6, %1                     \n\t"\
2237
        "decl %2                        \n\t"\
2238
        " jnz 1b                        \n\t"\
2239
        \
2240
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2241
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2242
        :"memory"\
2243
    );\
2244
}\
2245
\
2246
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2247
    uint64_t temp[9*2];\
2248
    uint64_t *temp_ptr= temp;\
2249
    int count= 9;\
2250
\
2251
    /*FIXME unroll */\
2252
    asm volatile(\
2253
        "pxor %%mm7, %%mm7              \n\t"\
2254
        "1:                             \n\t"\
2255
        "movq (%0), %%mm0               \n\t"\
2256
        "movq (%0), %%mm1               \n\t"\
2257
        "punpcklbw %%mm7, %%mm0         \n\t"\
2258
        "punpckhbw %%mm7, %%mm1         \n\t"\
2259
        "movq %%mm0, (%1)               \n\t"\
2260
        "movq %%mm1, 9*8(%1)            \n\t"\
2261
        "add $8, %1                     \n\t"\
2262
        "add %3, %0                     \n\t"\
2263
        "decl %2                        \n\t"\
2264
        " jnz 1b                        \n\t"\
2265
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2266
        : "r" ((long)srcStride)\
2267
        : "memory"\
2268
    );\
2269
    \
2270
    temp_ptr= temp;\
2271
    count=2;\
2272
    \
2273
/*FIXME reorder for speed */\
2274
    asm volatile(\
2275
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2276
        "1:                             \n\t"\
2277
        "movq (%0), %%mm0               \n\t"\
2278
        "movq 8(%0), %%mm1              \n\t"\
2279
        "movq 16(%0), %%mm2             \n\t"\
2280
        "movq 24(%0), %%mm3             \n\t"\
2281
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2282
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2283
        "add %4, %1                     \n\t"\
2284
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2285
        \
2286
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2287
        "add %4, %1                     \n\t"\
2288
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2289
        \
2290
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2291
        "add %4, %1                     \n\t"\
2292
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2293
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2294
                \
2295
        "add $72, %0                    \n\t"\
2296
        "add %6, %1                     \n\t"\
2297
        "decl %2                        \n\t"\
2298
        " jnz 1b                        \n\t"\
2299
         \
2300
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2301
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2302
        : "memory"\
2303
   );\
2304
}\
2305
\
2306
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2307
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
2308
}\
2309
\
2310
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2311
    uint64_t temp[8];\
2312
    uint8_t * const half= (uint8_t*)temp;\
2313
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2314
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2315
}\
2316
\
2317
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2318
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2319
}\
2320
\
2321
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2322
    uint64_t temp[8];\
2323
    uint8_t * const half= (uint8_t*)temp;\
2324
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2325
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2326
}\
2327
\
2328
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2329
    uint64_t temp[8];\
2330
    uint8_t * const half= (uint8_t*)temp;\
2331
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2332
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2333
}\
2334
\
2335
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2336
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2337
}\
2338
\
2339
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2340
    uint64_t temp[8];\
2341
    uint8_t * const half= (uint8_t*)temp;\
2342
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2343
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2344
}\
2345
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2346
    uint64_t half[8 + 9];\
2347
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2348
    uint8_t * const halfHV= ((uint8_t*)half);\
2349
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2350
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2351
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2352
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2353
}\
2354
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2355
    uint64_t half[8 + 9];\
2356
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2357
    uint8_t * const halfHV= ((uint8_t*)half);\
2358
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2359
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2360
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2361
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2362
}\
2363
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2364
    uint64_t half[8 + 9];\
2365
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2366
    uint8_t * const halfHV= ((uint8_t*)half);\
2367
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2368
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2369
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2370
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2371
}\
2372
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2373
    uint64_t half[8 + 9];\
2374
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2375
    uint8_t * const halfHV= ((uint8_t*)half);\
2376
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2377
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2378
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2379
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2380
}\
2381
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2382
    uint64_t half[8 + 9];\
2383
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2384
    uint8_t * const halfHV= ((uint8_t*)half);\
2385
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2386
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2387
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2388
}\
2389
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2390
    uint64_t half[8 + 9];\
2391
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2392
    uint8_t * const halfHV= ((uint8_t*)half);\
2393
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2394
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2395
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2396
}\
2397
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2398
    uint64_t half[8 + 9];\
2399
    uint8_t * const halfH= ((uint8_t*)half);\
2400
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2401
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2402
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2403
}\
2404
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2405
    uint64_t half[8 + 9];\
2406
    uint8_t * const halfH= ((uint8_t*)half);\
2407
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2408
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2409
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2410
}\
2411
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2412
    uint64_t half[9];\
2413
    uint8_t * const halfH= ((uint8_t*)half);\
2414
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2415
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2416
}\
2417
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2418
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
2419
}\
2420
\
2421
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2422
    uint64_t temp[32];\
2423
    uint8_t * const half= (uint8_t*)temp;\
2424
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2425
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2426
}\
2427
\
2428
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2429
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2430
}\
2431
\
2432
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2433
    uint64_t temp[32];\
2434
    uint8_t * const half= (uint8_t*)temp;\
2435
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2436
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2437
}\
2438
\
2439
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2440
    uint64_t temp[32];\
2441
    uint8_t * const half= (uint8_t*)temp;\
2442
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2443
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2444
}\
2445
\
2446
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2447
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2448
}\
2449
\
2450
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2451
    uint64_t temp[32];\
2452
    uint8_t * const half= (uint8_t*)temp;\
2453
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2454
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2455
}\
2456
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2457
    uint64_t half[16*2 + 17*2];\
2458
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2459
    uint8_t * const halfHV= ((uint8_t*)half);\
2460
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2461
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2462
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2463
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2464
}\
2465
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2466
    uint64_t half[16*2 + 17*2];\
2467
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2468
    uint8_t * const halfHV= ((uint8_t*)half);\
2469
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2470
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2471
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2472
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2473
}\
2474
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2475
    uint64_t half[16*2 + 17*2];\
2476
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2477
    uint8_t * const halfHV= ((uint8_t*)half);\
2478
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2479
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2480
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2481
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2482
}\
2483
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2484
    uint64_t half[16*2 + 17*2];\
2485
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2486
    uint8_t * const halfHV= ((uint8_t*)half);\
2487
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2488
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2489
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2490
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2491
}\
2492
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2493
    uint64_t half[16*2 + 17*2];\
2494
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2495
    uint8_t * const halfHV= ((uint8_t*)half);\
2496
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2497
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2498
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2499
}\
2500
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2501
    uint64_t half[16*2 + 17*2];\
2502
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2503
    uint8_t * const halfHV= ((uint8_t*)half);\
2504
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2505
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2506
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2507
}\
2508
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2509
    uint64_t half[17*2];\
2510
    uint8_t * const halfH= ((uint8_t*)half);\
2511
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2512
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2513
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2514
}\
2515
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2516
    uint64_t half[17*2];\
2517
    uint8_t * const halfH= ((uint8_t*)half);\
2518
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2519
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2520
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2521
}\
2522
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2523
    uint64_t half[17*2];\
2524
    uint8_t * const halfH= ((uint8_t*)half);\
2525
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2526
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2527
}
2528

    
2529
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
2530
#define AVG_3DNOW_OP(a,b,temp, size) \
2531
"mov" #size " " #b ", " #temp "   \n\t"\
2532
"pavgusb " #temp ", " #a "        \n\t"\
2533
"mov" #size " " #a ", " #b "      \n\t"
2534
#define AVG_MMX2_OP(a,b,temp, size) \
2535
"mov" #size " " #b ", " #temp "   \n\t"\
2536
"pavgb " #temp ", " #a "          \n\t"\
2537
"mov" #size " " #a ", " #b "      \n\t"
2538

    
2539
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
2540
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
2541
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2542
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
2543
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
2544
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
2545
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
2546
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
2547
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2548

    
2549
/***********************************/
2550
/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
2551

    
2552
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2553
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2554
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2555
}
2556
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2557
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2558
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2559
}
2560

    
2561
#define QPEL_2TAP(OPNAME, SIZE, MMX)\
2562
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2563
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2564
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2565
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2566
                          OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2567
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2568
                          OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2569
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2570
                          OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2571
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2572
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2573
}\
2574
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2575
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2576
}\
2577
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
2578
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
2579
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0)\
2580
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
2581
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
2582
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
2583
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
2584
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
2585

    
2586
QPEL_2TAP(put_, 16, mmx2)
2587
QPEL_2TAP(avg_, 16, mmx2)
2588
QPEL_2TAP(put_,  8, mmx2)
2589
QPEL_2TAP(avg_,  8, mmx2)
2590
QPEL_2TAP(put_, 16, 3dnow)
2591
QPEL_2TAP(avg_, 16, 3dnow)
2592
QPEL_2TAP(put_,  8, 3dnow)
2593
QPEL_2TAP(avg_,  8, 3dnow)
2594

    
2595

    
2596
#if 0
2597
static void just_return() { return; }
2598
#endif
2599

    
2600
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
2601
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
2602
    const int w = 8;
2603
    const int ix = ox>>(16+shift);
2604
    const int iy = oy>>(16+shift);
2605
    const int oxs = ox>>4;
2606
    const int oys = oy>>4;
2607
    const int dxxs = dxx>>4;
2608
    const int dxys = dxy>>4;
2609
    const int dyxs = dyx>>4;
2610
    const int dyys = dyy>>4;
2611
    const uint16_t r4[4] = {r,r,r,r};
2612
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
2613
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
2614
    const uint64_t shift2 = 2*shift;
2615
    uint8_t edge_buf[(h+1)*stride];
2616
    int x, y;
2617

    
2618
    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
2619
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
2620
    const int dxh = dxy*(h-1);
2621
    const int dyw = dyx*(w-1);
2622
    if( // non-constant fullpel offset (3% of blocks)
2623
        ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
2624
         (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
2625
        // uses more than 16 bits of subpel mv (only at huge resolution)
2626
        || (dxx|dxy|dyx|dyy)&15 )
2627
    {
2628
        //FIXME could still use mmx for some of the rows
2629
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
2630
        return;
2631
    }
2632

    
2633
    src += ix + iy*stride;
2634
    if( (unsigned)ix >= width-w ||
2635
        (unsigned)iy >= height-h )
2636
    {
2637
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
2638
        src = edge_buf;
2639
    }
2640

    
2641
    asm volatile(
2642
        "movd         %0, %%mm6 \n\t"
2643
        "pxor      %%mm7, %%mm7 \n\t"
2644
        "punpcklwd %%mm6, %%mm6 \n\t"
2645
        "punpcklwd %%mm6, %%mm6 \n\t"
2646
        :: "r"(1<<shift)
2647
    );
2648

    
2649
    for(x=0; x<w; x+=4){
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            asm volatile(
                "movq   %0,  %%mm4 \n\t"
                "movq   %1,  %%mm5 \n\t"
                "paddw  %2,  %%mm4 \n\t"
                "paddw  %3,  %%mm5 \n\t"
                "movq   %%mm4, %0  \n\t"
                "movq   %%mm5, %1  \n\t"
                "psrlw  $12, %%mm4 \n\t"
                "psrlw  $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            asm volatile(
                "movq   %%mm6, %%mm2 \n\t"
                "movq   %%mm6, %%mm1 \n\t"
                "psubw  %%mm4, %%mm2 \n\t"
                "psubw  %%mm5, %%mm1 \n\t"
                "movq   %%mm2, %%mm0 \n\t"
                "movq   %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)

                "movd   %4,    %%mm5 \n\t"
                "movd   %3,    %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy

                "movd   %2,    %%mm5 \n\t"
                "movd   %1,    %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw  %5,    %%mm1 \n\t"
                "paddw  %%mm3, %%mm2 \n\t"
                "paddw  %%mm1, %%mm0 \n\t"
                "paddw  %%mm2, %%mm0 \n\t"

                "psrlw    %6,    %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd     %%mm0, %0    \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4-h*stride;
    }
}

#ifdef CONFIG_ENCODERS

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
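/* Worked example (operand values chosen for illustration, not taken from the
   code): with inputs 3 and 0x4000 the 32-bit product is 0xC000, so pmulhw
   yields 0 (truncated), pmulhrw yields 1 (rounded at bit 15), and pmulhrsw
   yields 2, since it shifts by 15 instead of 16 and so keeps one extra bit. */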
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3

#endif /* CONFIG_ENCODERS */

#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        asm volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH

#include "h264dsp_mmx.c"

/* CAVS specific */
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* FLAC specific */
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);

/* VC1 specific */
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}

/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
#ifdef CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}

static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    asm volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        asm volatile(
            "movq    %0,    %%mm0 \n\t"
            "movq    %1,    %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1 \n\t"
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t"
            "pandn   %%mm1, %%mm4 \n\t"
            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    asm volatile("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    asm volatile(
            "movaps  %0,     %%xmm5 \n\t"
        ::"m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        asm volatile(
            "movaps  %0,     %%xmm0 \n\t"
            "movaps  %1,     %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}
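
/* For reference, a scalar sketch of the square polar coupling that the two
   SIMD kernels above implement (as described by the Vorbis spec; the helper
   name is illustrative and the block is not compiled): */
#if 0
static void vorbis_inverse_coupling_ref(float *mag, float *ang, int blocksize)
{
    int i;
    for(i=0; i<blocksize; i++) {
        if(mag[i] > 0.0) {
            if(ang[i] > 0.0) {
                ang[i]  = mag[i] - ang[i];
            } else {
                float t = ang[i];
                ang[i]  = mag[i];
                mag[i] += t;
            }
        } else {
            if(ang[i] > 0.0) {
                ang[i] += mag[i];
            } else {
                float t = ang[i];
                ang[i]  = mag[i];
                mag[i] -= t;
            }
        }
    }
}
#endif
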
static void vector_fmul_3dnow(float *dst, const float *src, int len){
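    /* dst[i] *= src[i] for 0 <= i < len; the loop walks backwards 4 floats at a
       time, so len is assumed to be a multiple of 4 (the SSE version below also
       needs 16-byte aligned pointers and len a multiple of 8). */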
    long i = (len-4)*4;
    asm volatile(
        "1: \n\t"
        "movq    (%1,%0), %%mm0 \n\t"
        "movq   8(%1,%0), %%mm1 \n\t"
        "pfmul   (%2,%0), %%mm0 \n\t"
        "pfmul  8(%2,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub  $16, %0 \n\t"
        "jge 1b \n\t"
        "femms  \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i = (len-8)*4;
    asm volatile(
        "1: \n\t"
        "movaps    (%1,%0), %%xmm0 \n\t"
        "movaps  16(%1,%0), %%xmm1 \n\t"
        "mulps     (%2,%0), %%xmm0 \n\t"
        "mulps   16(%2,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub  $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}

static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;
    asm volatile(
        "1: \n\t"
        "pswapd   8(%1), %%mm0 \n\t"
        "pswapd    (%1), %%mm1 \n\t"
        "pfmul  (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq  %%mm0,  (%2,%0) \n\t"
        "movq  %%mm1, 8(%2,%0) \n\t"
        "add   $16, %1 \n\t"
        "sub   $16, %0 \n\t"
        "jge   1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
    asm volatile("femms");
}
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;
    asm volatile(
        "1: \n\t"
        "movaps        16(%1), %%xmm0 \n\t"
        "movaps          (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps        (%3,%0), %%xmm0 \n\t"
        "mulps      16(%3,%0), %%xmm1 \n\t"
        "movaps     %%xmm0,   (%2,%0) \n\t"
        "movaps     %%xmm1, 16(%2,%0) \n\t"
        "add    $32, %1 \n\t"
        "sub    $32, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
}

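/* The two versions below compute dst[i*step] = src0[i]*src1[i] + src2[i]; only
   step==1 and step==2 with src3==0 are done with SIMD, anything else falls back
   to ff_vector_fmul_add_add_c(). */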
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long i = (len-4)*4;
    if(step == 2 && src3 == 0){
        dst += (len-4)*2;
        asm volatile(
            "1: \n\t"
            "movq   (%2,%0),  %%mm0 \n\t"
            "movq  8(%2,%0),  %%mm1 \n\t"
            "pfmul  (%3,%0),  %%mm0 \n\t"
            "pfmul 8(%3,%0),  %%mm1 \n\t"
            "pfadd  (%4,%0),  %%mm0 \n\t"
            "pfadd 8(%4,%0),  %%mm1 \n\t"
            "movd     %%mm0,   (%1) \n\t"
            "movd     %%mm1, 16(%1) \n\t"
            "psrlq      $32,  %%mm0 \n\t"
            "psrlq      $32,  %%mm1 \n\t"
            "movd     %%mm0,  8(%1) \n\t"
            "movd     %%mm1, 24(%1) \n\t"
            "sub  $32, %1 \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movq  %%mm0,   (%1,%0) \n\t"
            "movq  %%mm1,  8(%1,%0) \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    asm volatile("femms");
}
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    long i = (len-8)*4;
    if(step == 2 && src3 == 0){
        dst += (len-8)*2;
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movss     %%xmm0,   (%1) \n\t"
            "movss     %%xmm1, 32(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 16(%1) \n\t"
            "movss     %%xmm3, 48(%1) \n\t"
            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
            "movss     %%xmm0,  8(%1) \n\t"
            "movss     %%xmm1, 40(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 24(%1) \n\t"
            "movss     %%xmm3, 56(%1) \n\t"
            "sub  $64, %1 \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movaps %%xmm0,   (%1,%0) \n\t"
            "movaps %%xmm1, 16(%1,%0) \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}

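/* The two float_to_int16 versions below convert groups of 4 floats to signed
   16-bit samples, saturating via packssdw, so len is assumed to be a multiple
   of 4. */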
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "pf2id       %1, %%mm0 \n\t"
            "pf2id       %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("femms");
}
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "cvtps2pi    %1, %%mm0 \n\t"
            "cvtps2pi    %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("emms");
}

extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);

void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    mm_flags = mm_support();

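    /* avctx->dsp_mask lets the caller override CPU detection: with FF_MM_FORCE
       set, the masked flags are forced on even if mm_support() did not report
       them; otherwise the masked flags are turned off. */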
    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & FF_MM_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }

#if 0
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        av_log(avctx, AV_LOG_INFO, " mmx");
    if (mm_flags & MM_MMXEXT)
        av_log(avctx, AV_LOG_INFO, " mmxext");
    if (mm_flags & MM_3DNOW)
        av_log(avctx, AV_LOG_INFO, " 3dnow");
    if (mm_flags & MM_SSE)
        av_log(avctx, AV_LOG_INFO, " sse");
    if (mm_flags & MM_SSE2)
        av_log(avctx, AV_LOG_INFO, " sse2");
    av_log(avctx, AV_LOG_INFO, "\n");
#endif

    if (mm_flags & MM_MMX) {
        const int idct_algo= avctx->idct_algo;

#ifdef CONFIG_ENCODERS
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }
#endif //CONFIG_ENCODERS
        if(avctx->lowres==0){
            if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
                c->idct_put= ff_simple_idct_put_mmx;
                c->idct_add= ff_simple_idct_add_mmx;
                c->idct    = ff_simple_idct_mmx;
                c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
#ifdef CONFIG_GPL
            }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
                if(mm_flags & MM_MMXEXT){
                    c->idct_put= ff_libmpeg2mmx2_idct_put;
                    c->idct_add= ff_libmpeg2mmx2_idct_add;
                    c->idct    = ff_mmxext_idct;
                }else{
                    c->idct_put= ff_libmpeg2mmx_idct_put;
                    c->idct_add= ff_libmpeg2mmx_idct_add;
                    c->idct    = ff_mmx_idct;
                }
                c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
#endif
            }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
                     idct_algo==FF_IDCT_VP3 &&
                     avctx->codec->id!=CODEC_ID_THEORA &&
                     !(avctx->flags & CODEC_FLAG_BITEXACT)){
                if(mm_flags & MM_SSE2){
                    c->idct_put= ff_vp3_idct_put_sse2;
                    c->idct_add= ff_vp3_idct_add_sse2;
                    c->idct    = ff_vp3_idct_sse2;
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
                }else{
                    ff_vp3_dsp_init_mmx();
                    c->idct_put= ff_vp3_idct_put_mmx;
                    c->idct_add= ff_vp3_idct_add_mmx;
                    c->idct    = ff_vp3_idct_mmx;
                    c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                }
            }else if(idct_algo==FF_IDCT_CAVS){
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
            }else if(idct_algo==FF_IDCT_XVIDMMX){
                if(mm_flags & MM_MMXEXT){
                    c->idct_put= ff_idct_xvid_mmx2_put;
                    c->idct_add= ff_idct_xvid_mmx2_add;
                    c->idct    = ff_idct_xvid_mmx2;
                }else{
                    c->idct_put= ff_idct_xvid_mmx_put;