Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / dsputil_mmx.c @ ce611a27

History | View | Annotate | Download (138 KB)

1
/*
2
 * MMX optimized DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 *
22
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23
 */
24

    
25
#include "dsputil.h"
26
#include "simple_idct.h"
27
#include "mpegvideo.h"
28
#include "x86_cpu.h"
29
#include "mmx.h"
30
#include "vp3dsp_mmx.h"
31
#include "vp3dsp_sse2.h"
32
#include "h263.h"
33

    
34
//#undef NDEBUG
35
//#include <assert.h>
36

    
37
extern void ff_idct_xvid_mmx(short *block);
38
extern void ff_idct_xvid_mmx2(short *block);
39

    
40
int mm_flags; /* multimedia extension flags */
41

    
42
/* pixel operations */
43
static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
44
static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
45
static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
46

    
47
static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
48
{0x8000000080000000ULL, 0x8000000080000000ULL};
49

    
50
static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
51
static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
52
static const uint64_t ff_pw_4  attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
53
static const uint64_t ff_pw_5  attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
54
static const uint64_t ff_pw_8  attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
55
static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
56
static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
57
static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
58
static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
59

    
60
static const uint64_t ff_pb_1  attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
61
static const uint64_t ff_pb_3  attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
62
static const uint64_t ff_pb_7  attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
63
static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
64
static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
65
static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
66
static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
67

    
68
#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
69
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
70

    
71
#define MOVQ_WONE(regd) \
72
    __asm __volatile ( \
73
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
74
    "psrlw $15, %%" #regd ::)
75

    
76
#define MOVQ_BFE(regd) \
77
    __asm __volatile ( \
78
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
79
    "paddb %%" #regd ", %%" #regd " \n\t" ::)
80

    
81
#ifndef PIC
82
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
83
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
84
#else
85
// for shared library it's better to use this way for accessing constants
86
// pcmpeqd -> -1
87
#define MOVQ_BONE(regd) \
88
    __asm __volatile ( \
89
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
90
    "psrlw $15, %%" #regd " \n\t" \
91
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)
92

    
93
#define MOVQ_WTWO(regd) \
94
    __asm __volatile ( \
95
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
96
    "psrlw $15, %%" #regd " \n\t" \
97
    "psllw $1, %%" #regd " \n\t"::)
98

    
99
#endif
100

    
101
// using regr as temporary and for the output result
102
// first argument is unmodifed and second is trashed
103
// regfe is supposed to contain 0xfefefefefefefefe
104
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
105
    "movq " #rega ", " #regr "  \n\t"\
106
    "pand " #regb ", " #regr "  \n\t"\
107
    "pxor " #rega ", " #regb "  \n\t"\
108
    "pand " #regfe "," #regb "  \n\t"\
109
    "psrlq $1, " #regb "        \n\t"\
110
    "paddb " #regb ", " #regr " \n\t"
111

    
112
#define PAVGB_MMX(rega, regb, regr, regfe) \
113
    "movq " #rega ", " #regr "  \n\t"\
114
    "por  " #regb ", " #regr "  \n\t"\
115
    "pxor " #rega ", " #regb "  \n\t"\
116
    "pand " #regfe "," #regb "  \n\t"\
117
    "psrlq $1, " #regb "        \n\t"\
118
    "psubb " #regb ", " #regr " \n\t"
119

    
120
// mm6 is supposed to contain 0xfefefefefefefefe
121
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
122
    "movq " #rega ", " #regr "  \n\t"\
123
    "movq " #regc ", " #regp "  \n\t"\
124
    "pand " #regb ", " #regr "  \n\t"\
125
    "pand " #regd ", " #regp "  \n\t"\
126
    "pxor " #rega ", " #regb "  \n\t"\
127
    "pxor " #regc ", " #regd "  \n\t"\
128
    "pand %%mm6, " #regb "      \n\t"\
129
    "pand %%mm6, " #regd "      \n\t"\
130
    "psrlq $1, " #regb "        \n\t"\
131
    "psrlq $1, " #regd "        \n\t"\
132
    "paddb " #regb ", " #regr " \n\t"\
133
    "paddb " #regd ", " #regp " \n\t"
134

    
135
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
136
    "movq " #rega ", " #regr "  \n\t"\
137
    "movq " #regc ", " #regp "  \n\t"\
138
    "por  " #regb ", " #regr "  \n\t"\
139
    "por  " #regd ", " #regp "  \n\t"\
140
    "pxor " #rega ", " #regb "  \n\t"\
141
    "pxor " #regc ", " #regd "  \n\t"\
142
    "pand %%mm6, " #regb "      \n\t"\
143
    "pand %%mm6, " #regd "      \n\t"\
144
    "psrlq $1, " #regd "        \n\t"\
145
    "psrlq $1, " #regb "        \n\t"\
146
    "psubb " #regb ", " #regr " \n\t"\
147
    "psubb " #regd ", " #regp " \n\t"
148

    
149
/***********************************/
150
/* MMX no rounding */
151
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
152
#define SET_RND  MOVQ_WONE
153
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
154
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
155

    
156
#include "dsputil_mmx_rnd.h"
157

    
158
#undef DEF
159
#undef SET_RND
160
#undef PAVGBP
161
#undef PAVGB
162
/***********************************/
163
/* MMX rounding */
164

    
165
#define DEF(x, y) x ## _ ## y ##_mmx
166
#define SET_RND  MOVQ_WTWO
167
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
168
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
169

    
170
#include "dsputil_mmx_rnd.h"
171

    
172
#undef DEF
173
#undef SET_RND
174
#undef PAVGBP
175
#undef PAVGB
176

    
177
/***********************************/
178
/* 3Dnow specific */
179

    
180
#define DEF(x) x ## _3dnow
181
#define PAVGB "pavgusb"
182

    
183
#include "dsputil_mmx_avg.h"
184

    
185
#undef DEF
186
#undef PAVGB
187

    
188
/***********************************/
189
/* MMX2 specific */
190

    
191
#define DEF(x) x ## _mmx2
192

    
193
/* Introduced only in MMX2 set */
194
#define PAVGB "pavgb"
195

    
196
#include "dsputil_mmx_avg.h"
197

    
198
#undef DEF
199
#undef PAVGB
200

    
201
#define SBUTTERFLY(a,b,t,n,m)\
202
    "mov" #m " " #a ", " #t "         \n\t" /* abcd */\
203
    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
204
    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\
205

    
206
#define TRANSPOSE4(a,b,c,d,t)\
207
    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
208
    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
209
    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
210
    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
211

    
212
/***********************************/
213
/* standard MMX */
214

    
215
#ifdef CONFIG_ENCODERS
216
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
217
{
218
    asm volatile(
219
        "mov $-128, %%"REG_a"           \n\t"
220
        "pxor %%mm7, %%mm7              \n\t"
221
        ASMALIGN(4)
222
        "1:                             \n\t"
223
        "movq (%0), %%mm0               \n\t"
224
        "movq (%0, %2), %%mm2           \n\t"
225
        "movq %%mm0, %%mm1              \n\t"
226
        "movq %%mm2, %%mm3              \n\t"
227
        "punpcklbw %%mm7, %%mm0         \n\t"
228
        "punpckhbw %%mm7, %%mm1         \n\t"
229
        "punpcklbw %%mm7, %%mm2         \n\t"
230
        "punpckhbw %%mm7, %%mm3         \n\t"
231
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
232
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
233
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
234
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
235
        "add %3, %0                     \n\t"
236
        "add $32, %%"REG_a"             \n\t"
237
        "js 1b                          \n\t"
238
        : "+r" (pixels)
239
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
240
        : "%"REG_a
241
    );
242
}
243

    
244
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
245
{
246
    asm volatile(
247
        "pxor %%mm7, %%mm7              \n\t"
248
        "mov $-128, %%"REG_a"           \n\t"
249
        ASMALIGN(4)
250
        "1:                             \n\t"
251
        "movq (%0), %%mm0               \n\t"
252
        "movq (%1), %%mm2               \n\t"
253
        "movq %%mm0, %%mm1              \n\t"
254
        "movq %%mm2, %%mm3              \n\t"
255
        "punpcklbw %%mm7, %%mm0         \n\t"
256
        "punpckhbw %%mm7, %%mm1         \n\t"
257
        "punpcklbw %%mm7, %%mm2         \n\t"
258
        "punpckhbw %%mm7, %%mm3         \n\t"
259
        "psubw %%mm2, %%mm0             \n\t"
260
        "psubw %%mm3, %%mm1             \n\t"
261
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
262
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
263
        "add %3, %0                     \n\t"
264
        "add %3, %1                     \n\t"
265
        "add $16, %%"REG_a"             \n\t"
266
        "jnz 1b                         \n\t"
267
        : "+r" (s1), "+r" (s2)
268
        : "r" (block+64), "r" ((long)stride)
269
        : "%"REG_a
270
    );
271
}
272
#endif //CONFIG_ENCODERS
273

    
274
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
275
{
276
    const DCTELEM *p;
277
    uint8_t *pix;
278

    
279
    /* read the pixels */
280
    p = block;
281
    pix = pixels;
282
    /* unrolled loop */
283
        __asm __volatile(
284
                "movq   %3, %%mm0               \n\t"
285
                "movq   8%3, %%mm1              \n\t"
286
                "movq   16%3, %%mm2             \n\t"
287
                "movq   24%3, %%mm3             \n\t"
288
                "movq   32%3, %%mm4             \n\t"
289
                "movq   40%3, %%mm5             \n\t"
290
                "movq   48%3, %%mm6             \n\t"
291
                "movq   56%3, %%mm7             \n\t"
292
                "packuswb %%mm1, %%mm0          \n\t"
293
                "packuswb %%mm3, %%mm2          \n\t"
294
                "packuswb %%mm5, %%mm4          \n\t"
295
                "packuswb %%mm7, %%mm6          \n\t"
296
                "movq   %%mm0, (%0)             \n\t"
297
                "movq   %%mm2, (%0, %1)         \n\t"
298
                "movq   %%mm4, (%0, %1, 2)      \n\t"
299
                "movq   %%mm6, (%0, %2)         \n\t"
300
                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
301
                :"memory");
302
        pix += line_size*4;
303
        p += 32;
304

    
305
    // if here would be an exact copy of the code above
306
    // compiler would generate some very strange code
307
    // thus using "r"
308
    __asm __volatile(
309
            "movq       (%3), %%mm0             \n\t"
310
            "movq       8(%3), %%mm1            \n\t"
311
            "movq       16(%3), %%mm2           \n\t"
312
            "movq       24(%3), %%mm3           \n\t"
313
            "movq       32(%3), %%mm4           \n\t"
314
            "movq       40(%3), %%mm5           \n\t"
315
            "movq       48(%3), %%mm6           \n\t"
316
            "movq       56(%3), %%mm7           \n\t"
317
            "packuswb %%mm1, %%mm0              \n\t"
318
            "packuswb %%mm3, %%mm2              \n\t"
319
            "packuswb %%mm5, %%mm4              \n\t"
320
            "packuswb %%mm7, %%mm6              \n\t"
321
            "movq       %%mm0, (%0)             \n\t"
322
            "movq       %%mm2, (%0, %1)         \n\t"
323
            "movq       %%mm4, (%0, %1, 2)      \n\t"
324
            "movq       %%mm6, (%0, %2)         \n\t"
325
            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
326
            :"memory");
327
}
328

    
329
static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
330
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
331

    
332
void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
333
{
334
    int i;
335

    
336
    movq_m2r(*vector128, mm1);
337
    for (i = 0; i < 8; i++) {
338
        movq_m2r(*(block), mm0);
339
        packsswb_m2r(*(block + 4), mm0);
340
        block += 8;
341
        paddb_r2r(mm1, mm0);
342
        movq_r2m(mm0, *pixels);
343
        pixels += line_size;
344
    }
345
}
346

    
347
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
348
{
349
    const DCTELEM *p;
350
    uint8_t *pix;
351
    int i;
352

    
353
    /* read the pixels */
354
    p = block;
355
    pix = pixels;
356
    MOVQ_ZERO(mm7);
357
    i = 4;
358
    do {
359
        __asm __volatile(
360
                "movq   (%2), %%mm0     \n\t"
361
                "movq   8(%2), %%mm1    \n\t"
362
                "movq   16(%2), %%mm2   \n\t"
363
                "movq   24(%2), %%mm3   \n\t"
364
                "movq   %0, %%mm4       \n\t"
365
                "movq   %1, %%mm6       \n\t"
366
                "movq   %%mm4, %%mm5    \n\t"
367
                "punpcklbw %%mm7, %%mm4 \n\t"
368
                "punpckhbw %%mm7, %%mm5 \n\t"
369
                "paddsw %%mm4, %%mm0    \n\t"
370
                "paddsw %%mm5, %%mm1    \n\t"
371
                "movq   %%mm6, %%mm5    \n\t"
372
                "punpcklbw %%mm7, %%mm6 \n\t"
373
                "punpckhbw %%mm7, %%mm5 \n\t"
374
                "paddsw %%mm6, %%mm2    \n\t"
375
                "paddsw %%mm5, %%mm3    \n\t"
376
                "packuswb %%mm1, %%mm0  \n\t"
377
                "packuswb %%mm3, %%mm2  \n\t"
378
                "movq   %%mm0, %0       \n\t"
379
                "movq   %%mm2, %1       \n\t"
380
                :"+m"(*pix), "+m"(*(pix+line_size))
381
                :"r"(p)
382
                :"memory");
383
        pix += line_size*2;
384
        p += 16;
385
    } while (--i);
386
}
387

    
388
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
389
{
390
    __asm __volatile(
391
         "lea (%3, %3), %%"REG_a"       \n\t"
392
         ASMALIGN(3)
393
         "1:                            \n\t"
394
         "movd (%1), %%mm0              \n\t"
395
         "movd (%1, %3), %%mm1          \n\t"
396
         "movd %%mm0, (%2)              \n\t"
397
         "movd %%mm1, (%2, %3)          \n\t"
398
         "add %%"REG_a", %1             \n\t"
399
         "add %%"REG_a", %2             \n\t"
400
         "movd (%1), %%mm0              \n\t"
401
         "movd (%1, %3), %%mm1          \n\t"
402
         "movd %%mm0, (%2)              \n\t"
403
         "movd %%mm1, (%2, %3)          \n\t"
404
         "add %%"REG_a", %1             \n\t"
405
         "add %%"REG_a", %2             \n\t"
406
         "subl $4, %0                   \n\t"
407
         "jnz 1b                        \n\t"
408
         : "+g"(h), "+r" (pixels),  "+r" (block)
409
         : "r"((long)line_size)
410
         : "%"REG_a, "memory"
411
        );
412
}
413

    
414
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
415
{
416
    __asm __volatile(
417
         "lea (%3, %3), %%"REG_a"       \n\t"
418
         ASMALIGN(3)
419
         "1:                            \n\t"
420
         "movq (%1), %%mm0              \n\t"
421
         "movq (%1, %3), %%mm1          \n\t"
422
         "movq %%mm0, (%2)              \n\t"
423
         "movq %%mm1, (%2, %3)          \n\t"
424
         "add %%"REG_a", %1             \n\t"
425
         "add %%"REG_a", %2             \n\t"
426
         "movq (%1), %%mm0              \n\t"
427
         "movq (%1, %3), %%mm1          \n\t"
428
         "movq %%mm0, (%2)              \n\t"
429
         "movq %%mm1, (%2, %3)          \n\t"
430
         "add %%"REG_a", %1             \n\t"
431
         "add %%"REG_a", %2             \n\t"
432
         "subl $4, %0                   \n\t"
433
         "jnz 1b                        \n\t"
434
         : "+g"(h), "+r" (pixels),  "+r" (block)
435
         : "r"((long)line_size)
436
         : "%"REG_a, "memory"
437
        );
438
}
439

    
440
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
441
{
442
    __asm __volatile(
443
         "lea (%3, %3), %%"REG_a"       \n\t"
444
         ASMALIGN(3)
445
         "1:                            \n\t"
446
         "movq (%1), %%mm0              \n\t"
447
         "movq 8(%1), %%mm4             \n\t"
448
         "movq (%1, %3), %%mm1          \n\t"
449
         "movq 8(%1, %3), %%mm5         \n\t"
450
         "movq %%mm0, (%2)              \n\t"
451
         "movq %%mm4, 8(%2)             \n\t"
452
         "movq %%mm1, (%2, %3)          \n\t"
453
         "movq %%mm5, 8(%2, %3)         \n\t"
454
         "add %%"REG_a", %1             \n\t"
455
         "add %%"REG_a", %2             \n\t"
456
         "movq (%1), %%mm0              \n\t"
457
         "movq 8(%1), %%mm4             \n\t"
458
         "movq (%1, %3), %%mm1          \n\t"
459
         "movq 8(%1, %3), %%mm5         \n\t"
460
         "movq %%mm0, (%2)              \n\t"
461
         "movq %%mm4, 8(%2)             \n\t"
462
         "movq %%mm1, (%2, %3)          \n\t"
463
         "movq %%mm5, 8(%2, %3)         \n\t"
464
         "add %%"REG_a", %1             \n\t"
465
         "add %%"REG_a", %2             \n\t"
466
         "subl $4, %0                   \n\t"
467
         "jnz 1b                        \n\t"
468
         : "+g"(h), "+r" (pixels),  "+r" (block)
469
         : "r"((long)line_size)
470
         : "%"REG_a, "memory"
471
        );
472
}
473

    
474
static void clear_blocks_mmx(DCTELEM *blocks)
475
{
476
    __asm __volatile(
477
                "pxor %%mm7, %%mm7              \n\t"
478
                "mov $-128*6, %%"REG_a"         \n\t"
479
                "1:                             \n\t"
480
                "movq %%mm7, (%0, %%"REG_a")    \n\t"
481
                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
482
                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
483
                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
484
                "add $32, %%"REG_a"             \n\t"
485
                " js 1b                         \n\t"
486
                : : "r" (((uint8_t *)blocks)+128*6)
487
                : "%"REG_a
488
        );
489
}
490

    
491
#ifdef CONFIG_ENCODERS
492
static int pix_sum16_mmx(uint8_t * pix, int line_size){
493
    const int h=16;
494
    int sum;
495
    long index= -line_size*h;
496

    
497
    __asm __volatile(
498
                "pxor %%mm7, %%mm7              \n\t"
499
                "pxor %%mm6, %%mm6              \n\t"
500
                "1:                             \n\t"
501
                "movq (%2, %1), %%mm0           \n\t"
502
                "movq (%2, %1), %%mm1           \n\t"
503
                "movq 8(%2, %1), %%mm2          \n\t"
504
                "movq 8(%2, %1), %%mm3          \n\t"
505
                "punpcklbw %%mm7, %%mm0         \n\t"
506
                "punpckhbw %%mm7, %%mm1         \n\t"
507
                "punpcklbw %%mm7, %%mm2         \n\t"
508
                "punpckhbw %%mm7, %%mm3         \n\t"
509
                "paddw %%mm0, %%mm1             \n\t"
510
                "paddw %%mm2, %%mm3             \n\t"
511
                "paddw %%mm1, %%mm3             \n\t"
512
                "paddw %%mm3, %%mm6             \n\t"
513
                "add %3, %1                     \n\t"
514
                " js 1b                         \n\t"
515
                "movq %%mm6, %%mm5              \n\t"
516
                "psrlq $32, %%mm6               \n\t"
517
                "paddw %%mm5, %%mm6             \n\t"
518
                "movq %%mm6, %%mm5              \n\t"
519
                "psrlq $16, %%mm6               \n\t"
520
                "paddw %%mm5, %%mm6             \n\t"
521
                "movd %%mm6, %0                 \n\t"
522
                "andl $0xFFFF, %0               \n\t"
523
                : "=&r" (sum), "+r" (index)
524
                : "r" (pix - index), "r" ((long)line_size)
525
        );
526

    
527
        return sum;
528
}
529
#endif //CONFIG_ENCODERS
530

    
531
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
532
    long i=0;
533
    asm volatile(
534
        "1:                             \n\t"
535
        "movq  (%1, %0), %%mm0          \n\t"
536
        "movq  (%2, %0), %%mm1          \n\t"
537
        "paddb %%mm0, %%mm1             \n\t"
538
        "movq %%mm1, (%2, %0)           \n\t"
539
        "movq 8(%1, %0), %%mm0          \n\t"
540
        "movq 8(%2, %0), %%mm1          \n\t"
541
        "paddb %%mm0, %%mm1             \n\t"
542
        "movq %%mm1, 8(%2, %0)          \n\t"
543
        "add $16, %0                    \n\t"
544
        "cmp %3, %0                     \n\t"
545
        " jb 1b                         \n\t"
546
        : "+r" (i)
547
        : "r"(src), "r"(dst), "r"((long)w-15)
548
    );
549
    for(; i<w; i++)
550
        dst[i+0] += src[i+0];
551
}
552

    
553
#define H263_LOOP_FILTER \
554
        "pxor %%mm7, %%mm7              \n\t"\
555
        "movq  %0, %%mm0                \n\t"\
556
        "movq  %0, %%mm1                \n\t"\
557
        "movq  %3, %%mm2                \n\t"\
558
        "movq  %3, %%mm3                \n\t"\
559
        "punpcklbw %%mm7, %%mm0         \n\t"\
560
        "punpckhbw %%mm7, %%mm1         \n\t"\
561
        "punpcklbw %%mm7, %%mm2         \n\t"\
562
        "punpckhbw %%mm7, %%mm3         \n\t"\
563
        "psubw %%mm2, %%mm0             \n\t"\
564
        "psubw %%mm3, %%mm1             \n\t"\
565
        "movq  %1, %%mm2                \n\t"\
566
        "movq  %1, %%mm3                \n\t"\
567
        "movq  %2, %%mm4                \n\t"\
568
        "movq  %2, %%mm5                \n\t"\
569
        "punpcklbw %%mm7, %%mm2         \n\t"\
570
        "punpckhbw %%mm7, %%mm3         \n\t"\
571
        "punpcklbw %%mm7, %%mm4         \n\t"\
572
        "punpckhbw %%mm7, %%mm5         \n\t"\
573
        "psubw %%mm2, %%mm4             \n\t"\
574
        "psubw %%mm3, %%mm5             \n\t"\
575
        "psllw $2, %%mm4                \n\t"\
576
        "psllw $2, %%mm5                \n\t"\
577
        "paddw %%mm0, %%mm4             \n\t"\
578
        "paddw %%mm1, %%mm5             \n\t"\
579
        "pxor %%mm6, %%mm6              \n\t"\
580
        "pcmpgtw %%mm4, %%mm6           \n\t"\
581
        "pcmpgtw %%mm5, %%mm7           \n\t"\
582
        "pxor %%mm6, %%mm4              \n\t"\
583
        "pxor %%mm7, %%mm5              \n\t"\
584
        "psubw %%mm6, %%mm4             \n\t"\
585
        "psubw %%mm7, %%mm5             \n\t"\
586
        "psrlw $3, %%mm4                \n\t"\
587
        "psrlw $3, %%mm5                \n\t"\
588
        "packuswb %%mm5, %%mm4          \n\t"\
589
        "packsswb %%mm7, %%mm6          \n\t"\
590
        "pxor %%mm7, %%mm7              \n\t"\
591
        "movd %4, %%mm2                 \n\t"\
592
        "punpcklbw %%mm2, %%mm2         \n\t"\
593
        "punpcklbw %%mm2, %%mm2         \n\t"\
594
        "punpcklbw %%mm2, %%mm2         \n\t"\
595
        "psubusb %%mm4, %%mm2           \n\t"\
596
        "movq %%mm2, %%mm3              \n\t"\
597
        "psubusb %%mm4, %%mm3           \n\t"\
598
        "psubb %%mm3, %%mm2             \n\t"\
599
        "movq %1, %%mm3                 \n\t"\
600
        "movq %2, %%mm4                 \n\t"\
601
        "pxor %%mm6, %%mm3              \n\t"\
602
        "pxor %%mm6, %%mm4              \n\t"\
603
        "paddusb %%mm2, %%mm3           \n\t"\
604
        "psubusb %%mm2, %%mm4           \n\t"\
605
        "pxor %%mm6, %%mm3              \n\t"\
606
        "pxor %%mm6, %%mm4              \n\t"\
607
        "paddusb %%mm2, %%mm2           \n\t"\
608
        "packsswb %%mm1, %%mm0          \n\t"\
609
        "pcmpgtb %%mm0, %%mm7           \n\t"\
610
        "pxor %%mm7, %%mm0              \n\t"\
611
        "psubb %%mm7, %%mm0             \n\t"\
612
        "movq %%mm0, %%mm1              \n\t"\
613
        "psubusb %%mm2, %%mm0           \n\t"\
614
        "psubb %%mm0, %%mm1             \n\t"\
615
        "pand %5, %%mm1                 \n\t"\
616
        "psrlw $2, %%mm1                \n\t"\
617
        "pxor %%mm7, %%mm1              \n\t"\
618
        "psubb %%mm7, %%mm1             \n\t"\
619
        "movq %0, %%mm5                 \n\t"\
620
        "movq %3, %%mm6                 \n\t"\
621
        "psubb %%mm1, %%mm5             \n\t"\
622
        "paddb %%mm1, %%mm6             \n\t"
623

    
624
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
625
    if(ENABLE_ANY_H263) {
626
    const int strength= ff_h263_loop_filter_strength[qscale];
627

    
628
    asm volatile(
629

    
630
        H263_LOOP_FILTER
631

    
632
        "movq %%mm3, %1                 \n\t"
633
        "movq %%mm4, %2                 \n\t"
634
        "movq %%mm5, %0                 \n\t"
635
        "movq %%mm6, %3                 \n\t"
636
        : "+m" (*(uint64_t*)(src - 2*stride)),
637
          "+m" (*(uint64_t*)(src - 1*stride)),
638
          "+m" (*(uint64_t*)(src + 0*stride)),
639
          "+m" (*(uint64_t*)(src + 1*stride))
640
        : "g" (2*strength), "m"(ff_pb_FC)
641
    );
642
    }
643
}
644

    
645
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
646
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
647
        "movd  %4, %%mm0                \n\t"
648
        "movd  %5, %%mm1                \n\t"
649
        "movd  %6, %%mm2                \n\t"
650
        "movd  %7, %%mm3                \n\t"
651
        "punpcklbw %%mm1, %%mm0         \n\t"
652
        "punpcklbw %%mm3, %%mm2         \n\t"
653
        "movq %%mm0, %%mm1              \n\t"
654
        "punpcklwd %%mm2, %%mm0         \n\t"
655
        "punpckhwd %%mm2, %%mm1         \n\t"
656
        "movd  %%mm0, %0                \n\t"
657
        "punpckhdq %%mm0, %%mm0         \n\t"
658
        "movd  %%mm0, %1                \n\t"
659
        "movd  %%mm1, %2                \n\t"
660
        "punpckhdq %%mm1, %%mm1         \n\t"
661
        "movd  %%mm1, %3                \n\t"
662

    
663
        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
664
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
665
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
666
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
667
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
668
           "m" (*(uint32_t*)(src + 1*src_stride)),
669
           "m" (*(uint32_t*)(src + 2*src_stride)),
670
           "m" (*(uint32_t*)(src + 3*src_stride))
671
    );
672
}
673

    
674
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
675
    if(ENABLE_ANY_H263) {
676
    const int strength= ff_h263_loop_filter_strength[qscale];
677
    uint64_t temp[4] __attribute__ ((aligned(8)));
678
    uint8_t *btemp= (uint8_t*)temp;
679

    
680
    src -= 2;
681

    
682
    transpose4x4(btemp  , src           , 8, stride);
683
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
684
    asm volatile(
685
        H263_LOOP_FILTER // 5 3 4 6
686

    
687
        : "+m" (temp[0]),
688
          "+m" (temp[1]),
689
          "+m" (temp[2]),
690
          "+m" (temp[3])
691
        : "g" (2*strength), "m"(ff_pb_FC)
692
    );
693

    
694
    asm volatile(
695
        "movq %%mm5, %%mm1              \n\t"
696
        "movq %%mm4, %%mm0              \n\t"
697
        "punpcklbw %%mm3, %%mm5         \n\t"
698
        "punpcklbw %%mm6, %%mm4         \n\t"
699
        "punpckhbw %%mm3, %%mm1         \n\t"
700
        "punpckhbw %%mm6, %%mm0         \n\t"
701
        "movq %%mm5, %%mm3              \n\t"
702
        "movq %%mm1, %%mm6              \n\t"
703
        "punpcklwd %%mm4, %%mm5         \n\t"
704
        "punpcklwd %%mm0, %%mm1         \n\t"
705
        "punpckhwd %%mm4, %%mm3         \n\t"
706
        "punpckhwd %%mm0, %%mm6         \n\t"
707
        "movd %%mm5, (%0)               \n\t"
708
        "punpckhdq %%mm5, %%mm5         \n\t"
709
        "movd %%mm5, (%0,%2)            \n\t"
710
        "movd %%mm3, (%0,%2,2)          \n\t"
711
        "punpckhdq %%mm3, %%mm3         \n\t"
712
        "movd %%mm3, (%0,%3)            \n\t"
713
        "movd %%mm1, (%1)               \n\t"
714
        "punpckhdq %%mm1, %%mm1         \n\t"
715
        "movd %%mm1, (%1,%2)            \n\t"
716
        "movd %%mm6, (%1,%2,2)          \n\t"
717
        "punpckhdq %%mm6, %%mm6         \n\t"
718
        "movd %%mm6, (%1,%3)            \n\t"
719
        :: "r" (src),
720
           "r" (src + 4*stride),
721
           "r" ((long)   stride ),
722
           "r" ((long)(3*stride))
723
    );
724
    }
725
}
726

    
727
#ifdef CONFIG_ENCODERS
728
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
729
    int tmp;
730
  asm volatile (
731
      "movl $16,%%ecx\n"
732
      "pxor %%mm0,%%mm0\n"
733
      "pxor %%mm7,%%mm7\n"
734
      "1:\n"
735
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
736
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */
737

    
738
      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */
739

    
740
      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
741
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
742

    
743
      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
744
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
745
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
746

    
747
      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
748
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
749

    
750
      "pmaddwd %%mm3,%%mm3\n"
751
      "pmaddwd %%mm4,%%mm4\n"
752

    
753
      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
754
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
755
      "paddd %%mm3,%%mm4\n"
756
      "paddd %%mm2,%%mm7\n"
757

    
758
      "add %2, %0\n"
759
      "paddd %%mm4,%%mm7\n"
760
      "dec %%ecx\n"
761
      "jnz 1b\n"
762

    
763
      "movq %%mm7,%%mm1\n"
764
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
765
      "paddd %%mm7,%%mm1\n"
766
      "movd %%mm1,%1\n"
767
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
768
    return tmp;
769
}
770

    
771
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
772
    int tmp;
773
  asm volatile (
774
      "movl %4,%%ecx\n"
775
      "shr $1,%%ecx\n"
776
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
777
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
778
      "1:\n"
779
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
780
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
781
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
782
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */
783

    
784
      /* todo: mm1-mm2, mm3-mm4 */
785
      /* algo: substract mm1 from mm2 with saturation and vice versa */
786
      /*       OR the results to get absolute difference */
787
      "movq %%mm1,%%mm5\n"
788
      "movq %%mm3,%%mm6\n"
789
      "psubusb %%mm2,%%mm1\n"
790
      "psubusb %%mm4,%%mm3\n"
791
      "psubusb %%mm5,%%mm2\n"
792
      "psubusb %%mm6,%%mm4\n"
793

    
794
      "por %%mm1,%%mm2\n"
795
      "por %%mm3,%%mm4\n"
796

    
797
      /* now convert to 16-bit vectors so we can square them */
798
      "movq %%mm2,%%mm1\n"
799
      "movq %%mm4,%%mm3\n"
800

    
801
      "punpckhbw %%mm0,%%mm2\n"
802
      "punpckhbw %%mm0,%%mm4\n"
803
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
804
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
805

    
806
      "pmaddwd %%mm2,%%mm2\n"
807
      "pmaddwd %%mm4,%%mm4\n"
808
      "pmaddwd %%mm1,%%mm1\n"
809
      "pmaddwd %%mm3,%%mm3\n"
810

    
811
      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
812
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */
813

    
814
      "paddd %%mm2,%%mm1\n"
815
      "paddd %%mm4,%%mm3\n"
816
      "paddd %%mm1,%%mm7\n"
817
      "paddd %%mm3,%%mm7\n"
818

    
819
      "decl %%ecx\n"
820
      "jnz 1b\n"
821

    
822
      "movq %%mm7,%%mm1\n"
823
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
824
      "paddd %%mm7,%%mm1\n"
825
      "movd %%mm1,%2\n"
826
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
827
      : "r" ((long)line_size) , "m" (h)
828
      : "%ecx");
829
    return tmp;
830
}
831

    
832
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
833
    int tmp;
834
  asm volatile (
835
      "movl %4,%%ecx\n"
836
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
837
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
838
      "1:\n"
839
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
840
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
841
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
842
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */
843

    
844
      /* todo: mm1-mm2, mm3-mm4 */
845
      /* algo: substract mm1 from mm2 with saturation and vice versa */
846
      /*       OR the results to get absolute difference */
847
      "movq %%mm1,%%mm5\n"
848
      "movq %%mm3,%%mm6\n"
849
      "psubusb %%mm2,%%mm1\n"
850
      "psubusb %%mm4,%%mm3\n"
851
      "psubusb %%mm5,%%mm2\n"
852
      "psubusb %%mm6,%%mm4\n"
853

    
854
      "por %%mm1,%%mm2\n"
855
      "por %%mm3,%%mm4\n"
856

    
857
      /* now convert to 16-bit vectors so we can square them */
858
      "movq %%mm2,%%mm1\n"
859
      "movq %%mm4,%%mm3\n"
860

    
861
      "punpckhbw %%mm0,%%mm2\n"
862
      "punpckhbw %%mm0,%%mm4\n"
863
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
864
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
865

    
866
      "pmaddwd %%mm2,%%mm2\n"
867
      "pmaddwd %%mm4,%%mm4\n"
868
      "pmaddwd %%mm1,%%mm1\n"
869
      "pmaddwd %%mm3,%%mm3\n"
870

    
871
      "add %3,%0\n"
872
      "add %3,%1\n"
873

    
874
      "paddd %%mm2,%%mm1\n"
875
      "paddd %%mm4,%%mm3\n"
876
      "paddd %%mm1,%%mm7\n"
877
      "paddd %%mm3,%%mm7\n"
878

    
879
      "decl %%ecx\n"
880
      "jnz 1b\n"
881

    
882
      "movq %%mm7,%%mm1\n"
883
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
884
      "paddd %%mm7,%%mm1\n"
885
      "movd %%mm1,%2\n"
886
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
887
      : "r" ((long)line_size) , "m" (h)
888
      : "%ecx");
889
    return tmp;
890
}
891

    
892
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
893
    int tmp;
894
  asm volatile (
895
      "shr $1,%2\n"
896
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
897
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
898
      "1:\n"
899
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
900
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
901
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
902
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
903

    
904
      /* todo: mm1-mm2, mm3-mm4 */
905
      /* algo: substract mm1 from mm2 with saturation and vice versa */
906
      /*       OR the results to get absolute difference */
907
      "movdqa %%xmm1,%%xmm5\n"
908
      "movdqa %%xmm3,%%xmm6\n"
909
      "psubusb %%xmm2,%%xmm1\n"
910
      "psubusb %%xmm4,%%xmm3\n"
911
      "psubusb %%xmm5,%%xmm2\n"
912
      "psubusb %%xmm6,%%xmm4\n"
913

    
914
      "por %%xmm1,%%xmm2\n"
915
      "por %%xmm3,%%xmm4\n"
916

    
917
      /* now convert to 16-bit vectors so we can square them */
918
      "movdqa %%xmm2,%%xmm1\n"
919
      "movdqa %%xmm4,%%xmm3\n"
920

    
921
      "punpckhbw %%xmm0,%%xmm2\n"
922
      "punpckhbw %%xmm0,%%xmm4\n"
923
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
924
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */
925

    
926
      "pmaddwd %%xmm2,%%xmm2\n"
927
      "pmaddwd %%xmm4,%%xmm4\n"
928
      "pmaddwd %%xmm1,%%xmm1\n"
929
      "pmaddwd %%xmm3,%%xmm3\n"
930

    
931
      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
932
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */
933

    
934
      "paddd %%xmm2,%%xmm1\n"
935
      "paddd %%xmm4,%%xmm3\n"
936
      "paddd %%xmm1,%%xmm7\n"
937
      "paddd %%xmm3,%%xmm7\n"
938

    
939
      "decl %2\n"
940
      "jnz 1b\n"
941

    
942
      "movdqa %%xmm7,%%xmm1\n"
943
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
944
      "paddd %%xmm1,%%xmm7\n"
945
      "movdqa %%xmm7,%%xmm1\n"
946
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
947
      "paddd %%xmm1,%%xmm7\n"
948
      "movd %%xmm7,%3\n"
949
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
950
      : "r" ((long)line_size));
951
    return tmp;
952
}
953

    
954
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
955
    int tmp;
956
  asm volatile (
957
      "movl %3,%%ecx\n"
958
      "pxor %%mm7,%%mm7\n"
959
      "pxor %%mm6,%%mm6\n"
960

    
961
      "movq (%0),%%mm0\n"
962
      "movq %%mm0, %%mm1\n"
963
      "psllq $8, %%mm0\n"
964
      "psrlq $8, %%mm1\n"
965
      "psrlq $8, %%mm0\n"
966
      "movq %%mm0, %%mm2\n"
967
      "movq %%mm1, %%mm3\n"
968
      "punpcklbw %%mm7,%%mm0\n"
969
      "punpcklbw %%mm7,%%mm1\n"
970
      "punpckhbw %%mm7,%%mm2\n"
971
      "punpckhbw %%mm7,%%mm3\n"
972
      "psubw %%mm1, %%mm0\n"
973
      "psubw %%mm3, %%mm2\n"
974

    
975
      "add %2,%0\n"
976

    
977
      "movq (%0),%%mm4\n"
978
      "movq %%mm4, %%mm1\n"
979
      "psllq $8, %%mm4\n"
980
      "psrlq $8, %%mm1\n"
981
      "psrlq $8, %%mm4\n"
982
      "movq %%mm4, %%mm5\n"
983
      "movq %%mm1, %%mm3\n"
984
      "punpcklbw %%mm7,%%mm4\n"
985
      "punpcklbw %%mm7,%%mm1\n"
986
      "punpckhbw %%mm7,%%mm5\n"
987
      "punpckhbw %%mm7,%%mm3\n"
988
      "psubw %%mm1, %%mm4\n"
989
      "psubw %%mm3, %%mm5\n"
990
      "psubw %%mm4, %%mm0\n"
991
      "psubw %%mm5, %%mm2\n"
992
      "pxor %%mm3, %%mm3\n"
993
      "pxor %%mm1, %%mm1\n"
994
      "pcmpgtw %%mm0, %%mm3\n\t"
995
      "pcmpgtw %%mm2, %%mm1\n\t"
996
      "pxor %%mm3, %%mm0\n"
997
      "pxor %%mm1, %%mm2\n"
998
      "psubw %%mm3, %%mm0\n"
999
      "psubw %%mm1, %%mm2\n"
1000
      "paddw %%mm0, %%mm2\n"
1001
      "paddw %%mm2, %%mm6\n"
1002

    
1003
      "add %2,%0\n"
1004
      "1:\n"
1005

    
1006
      "movq (%0),%%mm0\n"
1007
      "movq %%mm0, %%mm1\n"
1008
      "psllq $8, %%mm0\n"
1009
      "psrlq $8, %%mm1\n"
1010
      "psrlq $8, %%mm0\n"
1011
      "movq %%mm0, %%mm2\n"
1012
      "movq %%mm1, %%mm3\n"
1013
      "punpcklbw %%mm7,%%mm0\n"
1014
      "punpcklbw %%mm7,%%mm1\n"
1015
      "punpckhbw %%mm7,%%mm2\n"
1016
      "punpckhbw %%mm7,%%mm3\n"
1017
      "psubw %%mm1, %%mm0\n"
1018
      "psubw %%mm3, %%mm2\n"
1019
      "psubw %%mm0, %%mm4\n"
1020
      "psubw %%mm2, %%mm5\n"
1021
      "pxor %%mm3, %%mm3\n"
1022
      "pxor %%mm1, %%mm1\n"
1023
      "pcmpgtw %%mm4, %%mm3\n\t"
1024
      "pcmpgtw %%mm5, %%mm1\n\t"
1025
      "pxor %%mm3, %%mm4\n"
1026
      "pxor %%mm1, %%mm5\n"
1027
      "psubw %%mm3, %%mm4\n"
1028
      "psubw %%mm1, %%mm5\n"
1029
      "paddw %%mm4, %%mm5\n"
1030
      "paddw %%mm5, %%mm6\n"
1031

    
1032
      "add %2,%0\n"
1033

    
1034
      "movq (%0),%%mm4\n"
1035
      "movq %%mm4, %%mm1\n"
1036
      "psllq $8, %%mm4\n"
1037
      "psrlq $8, %%mm1\n"
1038
      "psrlq $8, %%mm4\n"
1039
      "movq %%mm4, %%mm5\n"
1040
      "movq %%mm1, %%mm3\n"
1041
      "punpcklbw %%mm7,%%mm4\n"
1042
      "punpcklbw %%mm7,%%mm1\n"
1043
      "punpckhbw %%mm7,%%mm5\n"
1044
      "punpckhbw %%mm7,%%mm3\n"
1045
      "psubw %%mm1, %%mm4\n"
1046
      "psubw %%mm3, %%mm5\n"
1047
      "psubw %%mm4, %%mm0\n"
1048
      "psubw %%mm5, %%mm2\n"
1049
      "pxor %%mm3, %%mm3\n"
1050
      "pxor %%mm1, %%mm1\n"
1051
      "pcmpgtw %%mm0, %%mm3\n\t"
1052
      "pcmpgtw %%mm2, %%mm1\n\t"
1053
      "pxor %%mm3, %%mm0\n"
1054
      "pxor %%mm1, %%mm2\n"
1055
      "psubw %%mm3, %%mm0\n"
1056
      "psubw %%mm1, %%mm2\n"
1057
      "paddw %%mm0, %%mm2\n"
1058
      "paddw %%mm2, %%mm6\n"
1059

    
1060
      "add %2,%0\n"
1061
      "subl $2, %%ecx\n"
1062
      " jnz 1b\n"
1063

    
1064
      "movq %%mm6, %%mm0\n"
1065
      "punpcklwd %%mm7,%%mm0\n"
1066
      "punpckhwd %%mm7,%%mm6\n"
1067
      "paddd %%mm0, %%mm6\n"
1068

    
1069
      "movq %%mm6,%%mm0\n"
1070
      "psrlq $32, %%mm6\n"
1071
      "paddd %%mm6,%%mm0\n"
1072
      "movd %%mm0,%1\n"
1073
      : "+r" (pix1), "=r"(tmp)
1074
      : "r" ((long)line_size) , "g" (h-2)
1075
      : "%ecx");
1076
      return tmp;
1077
}
1078

    
1079
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1080
    int tmp;
1081
    uint8_t * pix= pix1;
1082
  asm volatile (
1083
      "movl %3,%%ecx\n"
1084
      "pxor %%mm7,%%mm7\n"
1085
      "pxor %%mm6,%%mm6\n"
1086

    
1087
      "movq (%0),%%mm0\n"
1088
      "movq 1(%0),%%mm1\n"
1089
      "movq %%mm0, %%mm2\n"
1090
      "movq %%mm1, %%mm3\n"
1091
      "punpcklbw %%mm7,%%mm0\n"
1092
      "punpcklbw %%mm7,%%mm1\n"
1093
      "punpckhbw %%mm7,%%mm2\n"
1094
      "punpckhbw %%mm7,%%mm3\n"
1095
      "psubw %%mm1, %%mm0\n"
1096
      "psubw %%mm3, %%mm2\n"
1097

    
1098
      "add %2,%0\n"
1099

    
1100
      "movq (%0),%%mm4\n"
1101
      "movq 1(%0),%%mm1\n"
1102
      "movq %%mm4, %%mm5\n"
1103
      "movq %%mm1, %%mm3\n"
1104
      "punpcklbw %%mm7,%%mm4\n"
1105
      "punpcklbw %%mm7,%%mm1\n"
1106
      "punpckhbw %%mm7,%%mm5\n"
1107
      "punpckhbw %%mm7,%%mm3\n"
1108
      "psubw %%mm1, %%mm4\n"
1109
      "psubw %%mm3, %%mm5\n"
1110
      "psubw %%mm4, %%mm0\n"
1111
      "psubw %%mm5, %%mm2\n"
1112
      "pxor %%mm3, %%mm3\n"
1113
      "pxor %%mm1, %%mm1\n"
1114
      "pcmpgtw %%mm0, %%mm3\n\t"
1115
      "pcmpgtw %%mm2, %%mm1\n\t"
1116
      "pxor %%mm3, %%mm0\n"
1117
      "pxor %%mm1, %%mm2\n"
1118
      "psubw %%mm3, %%mm0\n"
1119
      "psubw %%mm1, %%mm2\n"
1120
      "paddw %%mm0, %%mm2\n"
1121
      "paddw %%mm2, %%mm6\n"
1122

    
1123
      "add %2,%0\n"
1124
      "1:\n"
1125

    
1126
      "movq (%0),%%mm0\n"
1127
      "movq 1(%0),%%mm1\n"
1128
      "movq %%mm0, %%mm2\n"
1129
      "movq %%mm1, %%mm3\n"
1130
      "punpcklbw %%mm7,%%mm0\n"
1131
      "punpcklbw %%mm7,%%mm1\n"
1132
      "punpckhbw %%mm7,%%mm2\n"
1133
      "punpckhbw %%mm7,%%mm3\n"
1134
      "psubw %%mm1, %%mm0\n"
1135
      "psubw %%mm3, %%mm2\n"
1136
      "psubw %%mm0, %%mm4\n"
1137
      "psubw %%mm2, %%mm5\n"
1138
      "pxor %%mm3, %%mm3\n"
1139
      "pxor %%mm1, %%mm1\n"
1140
      "pcmpgtw %%mm4, %%mm3\n\t"
1141
      "pcmpgtw %%mm5, %%mm1\n\t"
1142
      "pxor %%mm3, %%mm4\n"
1143
      "pxor %%mm1, %%mm5\n"
1144
      "psubw %%mm3, %%mm4\n"
1145
      "psubw %%mm1, %%mm5\n"
1146
      "paddw %%mm4, %%mm5\n"
1147
      "paddw %%mm5, %%mm6\n"
1148

    
1149
      "add %2,%0\n"
1150

    
1151
      "movq (%0),%%mm4\n"
1152
      "movq 1(%0),%%mm1\n"
1153
      "movq %%mm4, %%mm5\n"
1154
      "movq %%mm1, %%mm3\n"
1155
      "punpcklbw %%mm7,%%mm4\n"
1156
      "punpcklbw %%mm7,%%mm1\n"
1157
      "punpckhbw %%mm7,%%mm5\n"
1158
      "punpckhbw %%mm7,%%mm3\n"
1159
      "psubw %%mm1, %%mm4\n"
1160
      "psubw %%mm3, %%mm5\n"
1161
      "psubw %%mm4, %%mm0\n"
1162
      "psubw %%mm5, %%mm2\n"
1163
      "pxor %%mm3, %%mm3\n"
1164
      "pxor %%mm1, %%mm1\n"
1165
      "pcmpgtw %%mm0, %%mm3\n\t"
1166
      "pcmpgtw %%mm2, %%mm1\n\t"
1167
      "pxor %%mm3, %%mm0\n"
1168
      "pxor %%mm1, %%mm2\n"
1169
      "psubw %%mm3, %%mm0\n"
1170
      "psubw %%mm1, %%mm2\n"
1171
      "paddw %%mm0, %%mm2\n"
1172
      "paddw %%mm2, %%mm6\n"
1173

    
1174
      "add %2,%0\n"
1175
      "subl $2, %%ecx\n"
1176
      " jnz 1b\n"
1177

    
1178
      "movq %%mm6, %%mm0\n"
1179
      "punpcklwd %%mm7,%%mm0\n"
1180
      "punpckhwd %%mm7,%%mm6\n"
1181
      "paddd %%mm0, %%mm6\n"
1182

    
1183
      "movq %%mm6,%%mm0\n"
1184
      "psrlq $32, %%mm6\n"
1185
      "paddd %%mm6,%%mm0\n"
1186
      "movd %%mm0,%1\n"
1187
      : "+r" (pix1), "=r"(tmp)
1188
      : "r" ((long)line_size) , "g" (h-2)
1189
      : "%ecx");
1190
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
1191
}
1192

    
1193
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1194
    MpegEncContext *c = p;
1195
    int score1, score2;
1196

    
1197
    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1198
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1199
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1200

    
1201
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1202
    else  return score1 + FFABS(score2)*8;
1203
}
1204

    
1205
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1206
    MpegEncContext *c = p;
1207
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1208
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1209

    
1210
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1211
    else  return score1 + FFABS(score2)*8;
1212
}
1213

    
1214
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1215
    int tmp;
1216

    
1217
    assert( (((int)pix) & 7) == 0);
1218
    assert((line_size &7) ==0);
1219

    
1220
#define SUM(in0, in1, out0, out1) \
1221
      "movq (%0), %%mm2\n"\
1222
      "movq 8(%0), %%mm3\n"\
1223
      "add %2,%0\n"\
1224
      "movq %%mm2, " #out0 "\n"\
1225
      "movq %%mm3, " #out1 "\n"\
1226
      "psubusb " #in0 ", %%mm2\n"\
1227
      "psubusb " #in1 ", %%mm3\n"\
1228
      "psubusb " #out0 ", " #in0 "\n"\
1229
      "psubusb " #out1 ", " #in1 "\n"\
1230
      "por %%mm2, " #in0 "\n"\
1231
      "por %%mm3, " #in1 "\n"\
1232
      "movq " #in0 ", %%mm2\n"\
1233
      "movq " #in1 ", %%mm3\n"\
1234
      "punpcklbw %%mm7, " #in0 "\n"\
1235
      "punpcklbw %%mm7, " #in1 "\n"\
1236
      "punpckhbw %%mm7, %%mm2\n"\
1237
      "punpckhbw %%mm7, %%mm3\n"\
1238
      "paddw " #in1 ", " #in0 "\n"\
1239
      "paddw %%mm3, %%mm2\n"\
1240
      "paddw %%mm2, " #in0 "\n"\
1241
      "paddw " #in0 ", %%mm6\n"
1242

    
1243

    
1244
  asm volatile (
1245
      "movl %3,%%ecx\n"
1246
      "pxor %%mm6,%%mm6\n"
1247
      "pxor %%mm7,%%mm7\n"
1248
      "movq (%0),%%mm0\n"
1249
      "movq 8(%0),%%mm1\n"
1250
      "add %2,%0\n"
1251
      "subl $2, %%ecx\n"
1252
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1253
      "1:\n"
1254

    
1255
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1256

    
1257
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1258

    
1259
      "subl $2, %%ecx\n"
1260
      "jnz 1b\n"
1261

    
1262
      "movq %%mm6,%%mm0\n"
1263
      "psrlq $32, %%mm6\n"
1264
      "paddw %%mm6,%%mm0\n"
1265
      "movq %%mm0,%%mm6\n"
1266
      "psrlq $16, %%mm0\n"
1267
      "paddw %%mm6,%%mm0\n"
1268
      "movd %%mm0,%1\n"
1269
      : "+r" (pix), "=r"(tmp)
1270
      : "r" ((long)line_size) , "m" (h)
1271
      : "%ecx");
1272
    return tmp & 0xFFFF;
1273
}
1274
#undef SUM
1275

    
1276
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1277
    int tmp;
1278

    
1279
    assert( (((int)pix) & 7) == 0);
1280
    assert((line_size &7) ==0);
1281

    
1282
#define SUM(in0, in1, out0, out1) \
1283
      "movq (%0), " #out0 "\n"\
1284
      "movq 8(%0), " #out1 "\n"\
1285
      "add %2,%0\n"\
1286
      "psadbw " #out0 ", " #in0 "\n"\
1287
      "psadbw " #out1 ", " #in1 "\n"\
1288
      "paddw " #in1 ", " #in0 "\n"\
1289
      "paddw " #in0 ", %%mm6\n"
1290

    
1291
  asm volatile (
1292
      "movl %3,%%ecx\n"
1293
      "pxor %%mm6,%%mm6\n"
1294
      "pxor %%mm7,%%mm7\n"
1295
      "movq (%0),%%mm0\n"
1296
      "movq 8(%0),%%mm1\n"
1297
      "add %2,%0\n"
1298
      "subl $2, %%ecx\n"
1299
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1300
      "1:\n"
1301

    
1302
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1303

    
1304
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1305

    
1306
      "subl $2, %%ecx\n"
1307
      "jnz 1b\n"
1308

    
1309
      "movd %%mm6,%1\n"
1310
      : "+r" (pix), "=r"(tmp)
1311
      : "r" ((long)line_size) , "m" (h)
1312
      : "%ecx");
1313
    return tmp;
1314
}
1315
#undef SUM
1316

    
1317
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1318
    int tmp;
1319

    
1320
    assert( (((int)pix1) & 7) == 0);
1321
    assert( (((int)pix2) & 7) == 0);
1322
    assert((line_size &7) ==0);
1323

    
1324
#define SUM(in0, in1, out0, out1) \
1325
      "movq (%0),%%mm2\n"\
1326
      "movq (%1)," #out0 "\n"\
1327
      "movq 8(%0),%%mm3\n"\
1328
      "movq 8(%1)," #out1 "\n"\
1329
      "add %3,%0\n"\
1330
      "add %3,%1\n"\
1331
      "psubb " #out0 ", %%mm2\n"\
1332
      "psubb " #out1 ", %%mm3\n"\
1333
      "pxor %%mm7, %%mm2\n"\
1334
      "pxor %%mm7, %%mm3\n"\
1335
      "movq %%mm2, " #out0 "\n"\
1336
      "movq %%mm3, " #out1 "\n"\
1337
      "psubusb " #in0 ", %%mm2\n"\
1338
      "psubusb " #in1 ", %%mm3\n"\
1339
      "psubusb " #out0 ", " #in0 "\n"\
1340
      "psubusb " #out1 ", " #in1 "\n"\
1341
      "por %%mm2, " #in0 "\n"\
1342
      "por %%mm3, " #in1 "\n"\
1343
      "movq " #in0 ", %%mm2\n"\
1344
      "movq " #in1 ", %%mm3\n"\
1345
      "punpcklbw %%mm7, " #in0 "\n"\
1346
      "punpcklbw %%mm7, " #in1 "\n"\
1347
      "punpckhbw %%mm7, %%mm2\n"\
1348
      "punpckhbw %%mm7, %%mm3\n"\
1349
      "paddw " #in1 ", " #in0 "\n"\
1350
      "paddw %%mm3, %%mm2\n"\
1351
      "paddw %%mm2, " #in0 "\n"\
1352
      "paddw " #in0 ", %%mm6\n"
1353

    
1354

    
1355
  asm volatile (
1356
      "movl %4,%%ecx\n"
1357
      "pxor %%mm6,%%mm6\n"
1358
      "pcmpeqw %%mm7,%%mm7\n"
1359
      "psllw $15, %%mm7\n"
1360
      "packsswb %%mm7, %%mm7\n"
1361
      "movq (%0),%%mm0\n"
1362
      "movq (%1),%%mm2\n"
1363
      "movq 8(%0),%%mm1\n"
1364
      "movq 8(%1),%%mm3\n"
1365
      "add %3,%0\n"
1366
      "add %3,%1\n"
1367
      "subl $2, %%ecx\n"
1368
      "psubb %%mm2, %%mm0\n"
1369
      "psubb %%mm3, %%mm1\n"
1370
      "pxor %%mm7, %%mm0\n"
1371
      "pxor %%mm7, %%mm1\n"
1372
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1373
      "1:\n"
1374

    
1375
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1376

    
1377
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1378

    
1379
      "subl $2, %%ecx\n"
1380
      "jnz 1b\n"
1381

    
1382
      "movq %%mm6,%%mm0\n"
1383
      "psrlq $32, %%mm6\n"
1384
      "paddw %%mm6,%%mm0\n"
1385
      "movq %%mm0,%%mm6\n"
1386
      "psrlq $16, %%mm0\n"
1387
      "paddw %%mm6,%%mm0\n"
1388
      "movd %%mm0,%2\n"
1389
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1390
      : "r" ((long)line_size) , "m" (h)
1391
      : "%ecx");
1392
    return tmp & 0x7FFF;
1393
}
1394
#undef SUM
1395

    
1396
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1397
    int tmp;
1398

    
1399
    assert( (((int)pix1) & 7) == 0);
1400
    assert( (((int)pix2) & 7) == 0);
1401
    assert((line_size &7) ==0);
1402

    
1403
#define SUM(in0, in1, out0, out1) \
1404
      "movq (%0)," #out0 "\n"\
1405
      "movq (%1),%%mm2\n"\
1406
      "movq 8(%0)," #out1 "\n"\
1407
      "movq 8(%1),%%mm3\n"\
1408
      "add %3,%0\n"\
1409
      "add %3,%1\n"\
1410
      "psubb %%mm2, " #out0 "\n"\
1411
      "psubb %%mm3, " #out1 "\n"\
1412
      "pxor %%mm7, " #out0 "\n"\
1413
      "pxor %%mm7, " #out1 "\n"\
1414
      "psadbw " #out0 ", " #in0 "\n"\
1415
      "psadbw " #out1 ", " #in1 "\n"\
1416
      "paddw " #in1 ", " #in0 "\n"\
1417
      "paddw " #in0 ", %%mm6\n"
1418

    
1419
  asm volatile (
1420
      "movl %4,%%ecx\n"
1421
      "pxor %%mm6,%%mm6\n"
1422
      "pcmpeqw %%mm7,%%mm7\n"
1423
      "psllw $15, %%mm7\n"
1424
      "packsswb %%mm7, %%mm7\n"
1425
      "movq (%0),%%mm0\n"
1426
      "movq (%1),%%mm2\n"
1427
      "movq 8(%0),%%mm1\n"
1428
      "movq 8(%1),%%mm3\n"
1429
      "add %3,%0\n"
1430
      "add %3,%1\n"
1431
      "subl $2, %%ecx\n"
1432
      "psubb %%mm2, %%mm0\n"
1433
      "psubb %%mm3, %%mm1\n"
1434
      "pxor %%mm7, %%mm0\n"
1435
      "pxor %%mm7, %%mm1\n"
1436
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1437
      "1:\n"
1438

    
1439
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1440

    
1441
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1442

    
1443
      "subl $2, %%ecx\n"
1444
      "jnz 1b\n"
1445

    
1446
      "movd %%mm6,%2\n"
1447
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1448
      : "r" ((long)line_size) , "m" (h)
1449
      : "%ecx");
1450
    return tmp;
1451
}
1452
#undef SUM
1453

    
1454
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1455
    long i=0;
1456
    asm volatile(
1457
        "1:                             \n\t"
1458
        "movq  (%2, %0), %%mm0          \n\t"
1459
        "movq  (%1, %0), %%mm1          \n\t"
1460
        "psubb %%mm0, %%mm1             \n\t"
1461
        "movq %%mm1, (%3, %0)           \n\t"
1462
        "movq 8(%2, %0), %%mm0          \n\t"
1463
        "movq 8(%1, %0), %%mm1          \n\t"
1464
        "psubb %%mm0, %%mm1             \n\t"
1465
        "movq %%mm1, 8(%3, %0)          \n\t"
1466
        "add $16, %0                    \n\t"
1467
        "cmp %4, %0                     \n\t"
1468
        " jb 1b                         \n\t"
1469
        : "+r" (i)
1470
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1471
    );
1472
    for(; i<w; i++)
1473
        dst[i+0] = src1[i+0]-src2[i+0];
1474
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
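
/* A minimal scalar sketch of the median prediction performed by the loop
 * above (src1 = previous line, src2 = current line). Illustrative only, not
 * compiled in; the MMX code works on wrapping bytes, which the int arithmetic
 * here glosses over. The i==0 sample is handled separately via *left and
 * *left_top, as in the tail of the function above. */
#if 0
static void sub_hfyu_median_prediction_ref(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
{
    int i;
    for (i = 1; i < w; i++) {
        int l    = src2[i - 1];                 // left neighbour
        int t    = src1[i];                     // top neighbour
        int lt   = src1[i - 1];                 // top-left neighbour
        int pred = mid_pred(l, t, l + t - lt);  // median of the three candidates
        dst[i]   = src2[i] - pred;              // store the residual
    }
}
#endif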

#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "=m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
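
/* In scalar terms, DIFF_PIXELS_4x8/8x8 simply load a block of byte
 * differences, widened to signed 16-bit words, into eight registers, one row
 * per register. Roughly (illustrative only, not compiled in):
 *
 *     for (y = 0; y < 8; y++)
 *         for (x = 0; x < 8; x++)          // x < 4 for the _4x8 variant
 *             diff[y][x] = p1[y*stride + x] - p2[y*stride + x];
 */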
1541

    
1542
#ifdef ARCH_X86_64
1543
// permutes 01234567 -> 05736421
1544
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1545
    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
1546
    SBUTTERFLY(c,d,b,wd,dqa)\
1547
    SBUTTERFLY(e,f,d,wd,dqa)\
1548
    SBUTTERFLY(g,h,f,wd,dqa)\
1549
    SBUTTERFLY(a,c,h,dq,dqa)\
1550
    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
1551
    SBUTTERFLY(e,g,b,dq,dqa)\
1552
    SBUTTERFLY(d,f,g,dq,dqa)\
1553
    SBUTTERFLY(a,e,f,qdq,dqa)\
1554
    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
1555
    SBUTTERFLY(h,b,d,qdq,dqa)\
1556
    SBUTTERFLY(c,g,b,qdq,dqa)\
1557
    "movdqa %%xmm8, "#g"              \n\t"
1558
#else
1559
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1560
    "movdqa "#h", "#t"                \n\t"\
1561
    SBUTTERFLY(a,b,h,wd,dqa)\
1562
    "movdqa "#h", 16"#t"              \n\t"\
1563
    "movdqa "#t", "#h"                \n\t"\
1564
    SBUTTERFLY(c,d,b,wd,dqa)\
1565
    SBUTTERFLY(e,f,d,wd,dqa)\
1566
    SBUTTERFLY(g,h,f,wd,dqa)\
1567
    SBUTTERFLY(a,c,h,dq,dqa)\
1568
    "movdqa "#h", "#t"                \n\t"\
1569
    "movdqa 16"#t", "#h"              \n\t"\
1570
    SBUTTERFLY(h,b,c,dq,dqa)\
1571
    SBUTTERFLY(e,g,b,dq,dqa)\
1572
    SBUTTERFLY(d,f,g,dq,dqa)\
1573
    SBUTTERFLY(a,e,f,qdq,dqa)\
1574
    SBUTTERFLY(h,d,e,qdq,dqa)\
1575
    "movdqa "#h", 16"#t"              \n\t"\
1576
    "movdqa "#t", "#h"                \n\t"\
1577
    SBUTTERFLY(h,b,d,qdq,dqa)\
1578
    SBUTTERFLY(c,g,b,qdq,dqa)\
1579
    "movdqa 16"#t", "#g"              \n\t"
1580
#endif

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
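
/* Each LBUTTERFLY2 is a pair of butterflies (a, b) -> (a+b, b-a); three
 * passes of them over eight registers form an 8-point Hadamard transform.
 * The hadamard8_diff_* functions further down apply this once, transpose,
 * apply it again, and sum the absolute values (an 8x8 SATD). A rough scalar
 * equivalent, ignoring the butterfly sign/ordering and the saturating sums
 * of the SIMD code (illustrative only, not compiled in): */
#if 0
static void wht8_ref(int *d)   /* in-place 8-point Hadamard butterfly network */
{
    int s, i, j;
    for (s = 1; s < 8; s <<= 1)
        for (i = 0; i < 8; i += 2*s)
            for (j = i; j < i+s; j++) {
                int a = d[j], b = d[j+s];
                d[j]   = a + b;
                d[j+s] = a - b;
            }
}

static int hadamard8_diff_ref(uint8_t *src1, uint8_t *src2, int stride)
{
    int d[8][8], col[8], sum = 0, x, y;
    for (y = 0; y < 8; y++)
        for (x = 0; x < 8; x++)
            d[y][x] = src1[y*stride + x] - src2[y*stride + x];
    for (y = 0; y < 8; y++)            /* transform rows ...  */
        wht8_ref(d[y]);
    for (x = 0; x < 8; x++) {          /* ... then columns    */
        for (y = 0; y < 8; y++) col[y] = d[y][x];
        wht8_ref(col);
        for (y = 0; y < 8; y++) sum += FFABS(col[y]);
    }
    return sum;
}
#endif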

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
1628

    
1629
#ifdef ARCH_X86_64
1630
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1631
#else
1632
#define MMABS_SUM_8x8_SSE2\
1633
    "movdqa %%xmm7, (%1)              \n\t"\
1634
    MMABS(%%xmm0, %%xmm7)\
1635
    MMABS(%%xmm1, %%xmm7)\
1636
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1637
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1638
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1639
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1640
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1641
    "movdqa (%1), %%xmm2              \n\t"\
1642
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1643
    "paddusw %%xmm1, %%xmm0           \n\t"
1644
#endif
1645

    
1646
#define LOAD4(o, a, b, c, d)\
1647
    "movq "#o"(%1),    "#a"           \n\t"\
1648
    "movq "#o"+8(%1),  "#b"           \n\t"\
1649
    "movq "#o"+16(%1), "#c"           \n\t"\
1650
    "movq "#o"+24(%1), "#d"           \n\t"\
1651

    
1652
#define STORE4(o, a, b, c, d)\
1653
    "movq "#a", "#o"(%1)              \n\t"\
1654
    "movq "#b", "#o"+8(%1)            \n\t"\
1655
    "movq "#c", "#o"+16(%1)           \n\t"\
1656
    "movq "#d", "#o"+24(%1)           \n\t"\
1657

    
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can reach
 * about 100k on extreme inputs. That is very unlikely to occur in natural video,
 * and even then some alternative mv/mode with a lower cost is almost always available. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\
1685

    
1686
#define HADAMARD8_DIFF_MMX(cpu) \
1687
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1688
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1689
    int sum;\
1690
\
1691
    assert(h==8);\
1692
\
1693
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1694
\
1695
    asm volatile(\
1696
        HADAMARD48\
1697
\
1698
        "movq %%mm7, 96(%1)             \n\t"\
1699
\
1700
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1701
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1702
\
1703
        "movq 96(%1), %%mm7             \n\t"\
1704
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1705
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1706
\
1707
        : "=r" (sum)\
1708
        : "r"(temp)\
1709
    );\
1710
\
1711
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1712
\
1713
    asm volatile(\
1714
        HADAMARD48\
1715
\
1716
        "movq %%mm7, 96(%1)             \n\t"\
1717
\
1718
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1719
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1720
\
1721
        "movq 96(%1), %%mm7             \n\t"\
1722
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1723
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
1724
        "movq %%mm6, %%mm7              \n\t"\
1725
        "movq %%mm0, %%mm6              \n\t"\
1726
\
1727
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1728
\
1729
        HADAMARD48\
1730
        "movq %%mm7, 64(%1)             \n\t"\
1731
        MMABS(%%mm0, %%mm7)\
1732
        MMABS(%%mm1, %%mm7)\
1733
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1734
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1735
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1736
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1737
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1738
        "movq 64(%1), %%mm2             \n\t"\
1739
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1740
        "paddusw %%mm1, %%mm0           \n\t"\
1741
        "movq %%mm0, 64(%1)             \n\t"\
1742
\
1743
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1744
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1745
\
1746
        HADAMARD48\
1747
        "movq %%mm7, (%1)               \n\t"\
1748
        MMABS(%%mm0, %%mm7)\
1749
        MMABS(%%mm1, %%mm7)\
1750
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1751
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1752
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1753
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1754
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1755
        "movq (%1), %%mm2               \n\t"\
1756
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1757
        "paddusw 64(%1), %%mm0          \n\t"\
1758
        "paddusw %%mm1, %%mm0           \n\t"\
1759
\
1760
        HSUM(%%mm0, %%mm1, %0)\
1761
\
1762
        : "=r" (sum)\
1763
        : "r"(temp)\
1764
    );\
1765
    return sum&0xFFFF;\
1766
}\
1767
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1768

    
1769
#define HADAMARD8_DIFF_SSE2(cpu) \
1770
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1771
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1772
    int sum;\
1773
\
1774
    assert(h==8);\
1775
\
1776
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1777
\
1778
    asm volatile(\
1779
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1780
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1781
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1782
        MMABS_SUM_8x8\
1783
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1784
        : "=r" (sum)\
1785
        : "r"(temp)\
1786
    );\
1787
    return sum&0xFFFF;\
1788
}\
1789
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif
1813

    
1814
#define DCT_SAD4(m,mm,o)\
1815
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
1816
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
1817
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
1818
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
1819
    MMABS_SUM(mm##2, mm##6, mm##0)\
1820
    MMABS_SUM(mm##3, mm##7, mm##1)\
1821
    MMABS_SUM(mm##4, mm##6, mm##0)\
1822
    MMABS_SUM(mm##5, mm##7, mm##1)\
1823

    
1824
#define DCT_SAD_MMX\
1825
    "pxor %%mm0, %%mm0                \n\t"\
1826
    "pxor %%mm1, %%mm1                \n\t"\
1827
    DCT_SAD4(q, %%mm, 0)\
1828
    DCT_SAD4(q, %%mm, 8)\
1829
    DCT_SAD4(q, %%mm, 64)\
1830
    DCT_SAD4(q, %%mm, 72)\
1831
    "paddusw %%mm1, %%mm0             \n\t"\
1832
    HSUM(%%mm0, %%mm1, %0)
1833

    
1834
#define DCT_SAD_SSE2\
1835
    "pxor %%xmm0, %%xmm0              \n\t"\
1836
    "pxor %%xmm1, %%xmm1              \n\t"\
1837
    DCT_SAD4(dqa, %%xmm, 0)\
1838
    DCT_SAD4(dqa, %%xmm, 64)\
1839
    "paddusw %%xmm1, %%xmm0           \n\t"\
1840
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}
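
/* Scalar sketch of what the generated sum_abs_dctelem_* functions compute,
 * modulo the saturating 16-bit accumulation used by the SIMD code
 * (illustrative only, not compiled in):
 *
 *     int sum = 0, i;
 *     for (i = 0; i < 64; i++)
 *         sum += FFABS(block[i]);
 *     return sum;
 */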

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    long i=size;
    asm volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
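
/* A minimal scalar reference for the function above; illustrative only, not
 * compiled in (size is assumed to be a multiple of 8, as the MMX loop
 * requires): */
#if 0
static int ssd_int8_vs_int16_ref(const int8_t *pix1, const int16_t *pix2, int size)
{
    int sum = 0, i;
    for (i = 0; i < size; i++) {
        int d = pix2[i] - pix1[i];   /* widen the int8 value and take the difference */
        sum  += d * d;               /* accumulate the squared error */
    }
    return sum;
}
#endif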

#endif //CONFIG_ENCODERS

#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)

#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
        "movq "#in7", " #m3 "             \n\t" /* d */\
        "movq "#in0", %%mm5               \n\t" /* D */\
        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
        "movq "#in1", %%mm5               \n\t" /* C */\
        "movq "#in2", %%mm6               \n\t" /* B */\
        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
        "paddw " #rnd ", %%mm4            \n\t" /* 20x1 - x4 + rnd */\
        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
        "psraw $5, %%mm5                  \n\t"\
        "packuswb %%mm5, %%mm5            \n\t"\
        OP(%%mm5, out, %%mm7, d)
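
/* The QPEL_V_LOW step above, like the horizontal lowpass filters below,
 * evaluates the MPEG-4 quarter-pel lowpass approximation
 *     out = clip_uint8((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5)
 * where x1..x4 are the pairwise sums of the samples at distance 0..3 from
 * the interpolated position, and rnd is 16 (rounding) or 15 (no_rnd), cf.
 * the ff_pw_16 / ff_pw_15 rounders passed to QPEL_BASE / QPEL_OP below. */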
1935

    
1936
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1937
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1938
    uint64_t temp;\
1939
\
1940
    asm volatile(\
1941
        "pxor %%mm7, %%mm7                \n\t"\
1942
        "1:                               \n\t"\
1943
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
1944
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
1945
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
1946
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
1947
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
1948
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
1949
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
1950
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
1951
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
1952
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
1953
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
1954
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
1955
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
1956
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
1957
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
1958
        "paddw %%mm3, %%mm5               \n\t" /* b */\
1959
        "paddw %%mm2, %%mm6               \n\t" /* c */\
1960
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
1961
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
1962
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
1963
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
1964
        "paddw %%mm4, %%mm0               \n\t" /* a */\
1965
        "paddw %%mm1, %%mm5               \n\t" /* d */\
1966
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1967
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
1968
        "paddw %6, %%mm6                  \n\t"\
1969
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
1970
        "psraw $5, %%mm0                  \n\t"\
1971
        "movq %%mm0, %5                   \n\t"\
1972
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1973
        \
1974
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
1975
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
1976
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
1977
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
1978
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
1979
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
1980
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
1981
        "paddw %%mm0, %%mm2               \n\t" /* b */\
1982
        "paddw %%mm5, %%mm3               \n\t" /* c */\
1983
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
1984
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
1985
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
1986
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
1987
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
1988
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
1989
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
1990
        "paddw %%mm2, %%mm1               \n\t" /* a */\
1991
        "paddw %%mm6, %%mm4               \n\t" /* d */\
1992
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1993
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
1994
        "paddw %6, %%mm1                  \n\t"\
1995
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
1996
        "psraw $5, %%mm3                  \n\t"\
1997
        "movq %5, %%mm1                   \n\t"\
1998
        "packuswb %%mm3, %%mm1            \n\t"\
1999
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
2000
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
2001
        \
2002
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
2003
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
2004
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
2005
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
2006
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
2007
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
2008
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
2009
        "paddw %%mm1, %%mm5               \n\t" /* b */\
2010
        "paddw %%mm4, %%mm0               \n\t" /* c */\
2011
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
2012
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
2013
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
2014
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
2015
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
2016
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
2017
        "paddw %%mm3, %%mm2               \n\t" /* d */\
2018
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
2019
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
2020
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
2021
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
2022
        "paddw %%mm2, %%mm6               \n\t" /* a */\
2023
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
2024
        "paddw %6, %%mm0                  \n\t"\
2025
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
2026
        "psraw $5, %%mm0                  \n\t"\
2027
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
2028
        \
2029
        "paddw %%mm5, %%mm3               \n\t" /* a */\
2030
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
2031
        "paddw %%mm4, %%mm6               \n\t" /* b */\
2032
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
2033
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
2034
        "paddw %%mm1, %%mm4               \n\t" /* c */\
2035
        "paddw %%mm2, %%mm5               \n\t" /* d */\
2036
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
2037
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
2038
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2039
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
2040
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
2041
        "paddw %6, %%mm4                  \n\t"\
2042
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
2043
        "psraw $5, %%mm4                  \n\t"\
2044
        "packuswb %%mm4, %%mm0            \n\t"\
2045
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2046
        \
2047
        "add %3, %0                       \n\t"\
2048
        "add %4, %1                       \n\t"\
2049
        "decl %2                          \n\t"\
2050
        " jnz 1b                          \n\t"\
2051
        : "+a"(src), "+c"(dst), "+m"(h)\
2052
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2053
        : "memory"\
2054
    );\
2055
}\
2056
\
2057
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2058
    int i;\
2059
    int16_t temp[16];\
2060
    /* quick HACK, XXX FIXME MUST be optimized */\
2061
    for(i=0; i<h; i++)\
2062
    {\
2063
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2064
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2065
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2066
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2067
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2068
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2069
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2070
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2071
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2072
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2073
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2074
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2075
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2076
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2077
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2078
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2079
        asm volatile(\
2080
            "movq (%0), %%mm0               \n\t"\
2081
            "movq 8(%0), %%mm1              \n\t"\
2082
            "paddw %2, %%mm0                \n\t"\
2083
            "paddw %2, %%mm1                \n\t"\
2084
            "psraw $5, %%mm0                \n\t"\
2085
            "psraw $5, %%mm1                \n\t"\
2086
            "packuswb %%mm1, %%mm0          \n\t"\
2087
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2088
            "movq 16(%0), %%mm0             \n\t"\
2089
            "movq 24(%0), %%mm1             \n\t"\
2090
            "paddw %2, %%mm0                \n\t"\
2091
            "paddw %2, %%mm1                \n\t"\
2092
            "psraw $5, %%mm0                \n\t"\
2093
            "psraw $5, %%mm1                \n\t"\
2094
            "packuswb %%mm1, %%mm0          \n\t"\
2095
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2096
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2097
            : "memory"\
2098
        );\
2099
        dst+=dstStride;\
2100
        src+=srcStride;\
2101
    }\
2102
}\
2103
\
2104
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2105
    uint64_t temp;\
2106
\
2107
    asm volatile(\
2108
        "pxor %%mm7, %%mm7                \n\t"\
2109
        "1:                               \n\t"\
2110
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
2111
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
2112
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
2113
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
2114
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
2115
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
2116
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
2117
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
2118
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
2119
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
2120
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
2121
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
2122
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
2123
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
2124
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
2125
        "paddw %%mm3, %%mm5               \n\t" /* b */\
2126
        "paddw %%mm2, %%mm6               \n\t" /* c */\
2127
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
2128
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
2129
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
2130
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
2131
        "paddw %%mm4, %%mm0               \n\t" /* a */\
2132
        "paddw %%mm1, %%mm5               \n\t" /* d */\
2133
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2134
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
2135
        "paddw %6, %%mm6                  \n\t"\
2136
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
2137
        "psraw $5, %%mm0                  \n\t"\
2138
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2139
        \
2140
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
2141
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
2142
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
2143
        "paddw %%mm5, %%mm1               \n\t" /* a */\
2144
        "paddw %%mm6, %%mm2               \n\t" /* b */\
2145
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
2146
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
2147
        "paddw %%mm6, %%mm3               \n\t" /* c */\
2148
        "paddw %%mm5, %%mm4               \n\t" /* d */\
2149
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
2150
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
2151
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2152
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
2153
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
2154
        "paddw %6, %%mm1                  \n\t"\
2155
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
2156
        "psraw $5, %%mm3                  \n\t"\
2157
        "packuswb %%mm3, %%mm0            \n\t"\
2158
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
2159
        \
2160
        "add %3, %0                       \n\t"\
2161
        "add %4, %1                       \n\t"\
2162
        "decl %2                          \n\t"\
2163
        " jnz 1b                          \n\t"\
2164
        : "+a"(src), "+c"(dst), "+m"(h)\
2165
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2166
        : "memory"\
2167
    );\
2168
}\
2169
\
2170
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2171
    int i;\
2172
    int16_t temp[8];\
2173
    /* quick HACK, XXX FIXME MUST be optimized */\
2174
    for(i=0; i<h; i++)\
2175
    {\
2176
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2177
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2178
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2179
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2180
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2181
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2182
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2183
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2184
        asm volatile(\
2185
            "movq (%0), %%mm0           \n\t"\
2186
            "movq 8(%0), %%mm1          \n\t"\
2187
            "paddw %2, %%mm0            \n\t"\
2188
            "paddw %2, %%mm1            \n\t"\
2189
            "psraw $5, %%mm0            \n\t"\
2190
            "psraw $5, %%mm1            \n\t"\
2191
            "packuswb %%mm1, %%mm0      \n\t"\
2192
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2193
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2194
            :"memory"\
2195
        );\
2196
        dst+=dstStride;\
2197
        src+=srcStride;\
2198
    }\
2199
}
2200

    
2201
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2202
\
2203
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2204
    uint64_t temp[17*4];\
2205
    uint64_t *temp_ptr= temp;\
2206
    int count= 17;\
2207
\
2208
    /*FIXME unroll */\
2209
    asm volatile(\
2210
        "pxor %%mm7, %%mm7              \n\t"\
2211
        "1:                             \n\t"\
2212
        "movq (%0), %%mm0               \n\t"\
2213
        "movq (%0), %%mm1               \n\t"\
2214
        "movq 8(%0), %%mm2              \n\t"\
2215
        "movq 8(%0), %%mm3              \n\t"\
2216
        "punpcklbw %%mm7, %%mm0         \n\t"\
2217
        "punpckhbw %%mm7, %%mm1         \n\t"\
2218
        "punpcklbw %%mm7, %%mm2         \n\t"\
2219
        "punpckhbw %%mm7, %%mm3         \n\t"\
2220
        "movq %%mm0, (%1)               \n\t"\
2221
        "movq %%mm1, 17*8(%1)           \n\t"\
2222
        "movq %%mm2, 2*17*8(%1)         \n\t"\
2223
        "movq %%mm3, 3*17*8(%1)         \n\t"\
2224
        "add $8, %1                     \n\t"\
2225
        "add %3, %0                     \n\t"\
2226
        "decl %2                        \n\t"\
2227
        " jnz 1b                        \n\t"\
2228
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2229
        : "r" ((long)srcStride)\
2230
        : "memory"\
2231
    );\
2232
    \
2233
    temp_ptr= temp;\
2234
    count=4;\
2235
    \
2236
/*FIXME reorder for speed */\
2237
    asm volatile(\
2238
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2239
        "1:                             \n\t"\
2240
        "movq (%0), %%mm0               \n\t"\
2241
        "movq 8(%0), %%mm1              \n\t"\
2242
        "movq 16(%0), %%mm2             \n\t"\
2243
        "movq 24(%0), %%mm3             \n\t"\
2244
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2245
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2246
        "add %4, %1                     \n\t"\
2247
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2248
        \
2249
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2250
        "add %4, %1                     \n\t"\
2251
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2252
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2253
        "add %4, %1                     \n\t"\
2254
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2255
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2256
        "add %4, %1                     \n\t"\
2257
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2258
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2259
        "add %4, %1                     \n\t"\
2260
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2261
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2262
        "add %4, %1                     \n\t"\
2263
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2264
        \
2265
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2266
        "add %4, %1                     \n\t"  \
2267
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2268
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2269
        \
2270
        "add $136, %0                   \n\t"\
2271
        "add %6, %1                     \n\t"\
2272
        "decl %2                        \n\t"\
2273
        " jnz 1b                        \n\t"\
2274
        \
2275
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2276
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2277
        :"memory"\
2278
    );\
2279
}\
2280
\
2281
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2282
    uint64_t temp[9*2];\
2283
    uint64_t *temp_ptr= temp;\
2284
    int count= 9;\
2285
\
2286
    /*FIXME unroll */\
2287
    asm volatile(\
2288
        "pxor %%mm7, %%mm7              \n\t"\
2289
        "1:                             \n\t"\
2290
        "movq (%0), %%mm0               \n\t"\
2291
        "movq (%0), %%mm1               \n\t"\
2292
        "punpcklbw %%mm7, %%mm0         \n\t"\
2293
        "punpckhbw %%mm7, %%mm1         \n\t"\
2294
        "movq %%mm0, (%1)               \n\t"\
2295
        "movq %%mm1, 9*8(%1)            \n\t"\
2296
        "add $8, %1                     \n\t"\
2297
        "add %3, %0                     \n\t"\
2298
        "decl %2                        \n\t"\
2299
        " jnz 1b                        \n\t"\
2300
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2301
        : "r" ((long)srcStride)\
2302
        : "memory"\
2303
    );\
2304
    \
2305
    temp_ptr= temp;\
2306
    count=2;\
2307
    \
2308
/*FIXME reorder for speed */\
2309
    asm volatile(\
2310
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2311
        "1:                             \n\t"\
2312
        "movq (%0), %%mm0               \n\t"\
2313
        "movq 8(%0), %%mm1              \n\t"\
2314
        "movq 16(%0), %%mm2             \n\t"\
2315
        "movq 24(%0), %%mm3             \n\t"\
2316
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2317
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2318
        "add %4, %1                     \n\t"\
2319
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2320
        \
2321
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2322
        "add %4, %1                     \n\t"\
2323
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2324
        \
2325
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2326
        "add %4, %1                     \n\t"\
2327
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2328
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2329
                \
2330
        "add $72, %0                    \n\t"\
2331
        "add %6, %1                     \n\t"\
2332
        "decl %2                        \n\t"\
2333
        " jnz 1b                        \n\t"\
2334
         \
2335
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2336
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2337
        : "memory"\
2338
   );\
2339
}\
2340
\
2341
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2342
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
2343
}\
2344
\
2345
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2346
    uint64_t temp[8];\
2347
    uint8_t * const half= (uint8_t*)temp;\
2348
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2349
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2350
}\
2351
\
2352
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2353
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2354
}\
2355
\
2356
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2357
    uint64_t temp[8];\
2358
    uint8_t * const half= (uint8_t*)temp;\
2359
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2360
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2361
}\
2362
\
2363
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2364
    uint64_t temp[8];\
2365
    uint8_t * const half= (uint8_t*)temp;\
2366
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2367
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2368
}\
2369
\
2370
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2371
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2372
}\
2373
\
2374
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2375
    uint64_t temp[8];\
2376
    uint8_t * const half= (uint8_t*)temp;\
2377
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2378
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2379
}\
2380
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2381
    uint64_t half[8 + 9];\
2382
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2383
    uint8_t * const halfHV= ((uint8_t*)half);\
2384
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2385
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2386
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2387
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2388
}\
2389
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2390
    uint64_t half[8 + 9];\
2391
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2392
    uint8_t * const halfHV= ((uint8_t*)half);\
2393
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2394
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2395
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2396
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2397
}\
2398
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2399
    uint64_t half[8 + 9];\
2400
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2401
    uint8_t * const halfHV= ((uint8_t*)half);\
2402
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2403
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2404
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2405
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2406
}\
2407
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2408
    uint64_t half[8 + 9];\
2409
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2410
    uint8_t * const halfHV= ((uint8_t*)half);\
2411
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2412
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2413
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2414
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2415
}\
2416
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2417
    uint64_t half[8 + 9];\
2418
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2419
    uint8_t * const halfHV= ((uint8_t*)half);\
2420
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2421
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2422
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2423
}\
2424
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2425
    uint64_t half[8 + 9];\
2426
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2427
    uint8_t * const halfHV= ((uint8_t*)half);\
2428
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2429
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2430
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2431
}\
2432
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2433
    uint64_t half[8 + 9];\
2434
    uint8_t * const halfH= ((uint8_t*)half);\
2435
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2436
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2437
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2438
}\
2439
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2440
    uint64_t half[8 + 9];\
2441
    uint8_t * const halfH= ((uint8_t*)half);\
2442
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2443
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2444
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2445
}\
2446
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2447
    uint64_t half[9];\
2448
    uint8_t * const halfH= ((uint8_t*)half);\
2449
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2450
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2451
}\
2452
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2453
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
2454
}\
2455
\
2456
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2457
    uint64_t temp[32];\
2458
    uint8_t * const half= (uint8_t*)temp;\
2459
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2460
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2461
}\
2462
\
2463
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2464
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2465
}\
2466
\
2467
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2468
    uint64_t temp[32];\
2469
    uint8_t * const half= (uint8_t*)temp;\
2470
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2471
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2472
}\
2473
\
2474
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2475
    uint64_t temp[32];\
2476
    uint8_t * const half= (uint8_t*)temp;\
2477
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2478
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2479
}\
2480
\
2481
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2482
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2483
}\
2484
\
2485
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2486
    uint64_t temp[32];\
2487
    uint8_t * const half= (uint8_t*)temp;\
2488
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2489
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2490
}\
2491
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2492
    uint64_t half[16*2 + 17*2];\
2493
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2494
    uint8_t * const halfHV= ((uint8_t*)half);\
2495
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2496
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2497
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2498
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2499
}\
2500
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2501
    uint64_t half[16*2 + 17*2];\
2502
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2503
    uint8_t * const halfHV= ((uint8_t*)half);\
2504
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2505
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2506
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2507
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2508
}\
2509
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2510
    uint64_t half[16*2 + 17*2];\
2511
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2512
    uint8_t * const halfHV= ((uint8_t*)half);\
2513
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2514
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2515
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2516
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2517
}\
2518
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2519
    uint64_t half[16*2 + 17*2];\
2520
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2521
    uint8_t * const halfHV= ((uint8_t*)half);\
2522
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2523
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2524
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2525
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2526
}\
2527
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2528
    uint64_t half[16*2 + 17*2];\
2529
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2530
    uint8_t * const halfHV= ((uint8_t*)half);\
2531
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2532
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2533
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2534
}\
2535
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2536
    uint64_t half[16*2 + 17*2];\
2537
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2538
    uint8_t * const halfHV= ((uint8_t*)half);\
2539
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2540
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2541
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2542
}\
2543
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2544
    uint64_t half[17*2];\
2545
    uint8_t * const halfH= ((uint8_t*)half);\
2546
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2547
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2548
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2549
}\
2550
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2551
    uint64_t half[17*2];\
2552
    uint8_t * const halfH= ((uint8_t*)half);\
2553
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2554
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2555
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2556
}\
2557
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2558
    uint64_t half[17*2];\
2559
    uint8_t * const halfH= ((uint8_t*)half);\
2560
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2561
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2562
}

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"

QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)

/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
2586

    
2587
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2588
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2589
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2590
}
2591
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2592
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2593
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2594
}
2595

    
2596
#define QPEL_2TAP(OPNAME, SIZE, MMX)\
2597
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2598
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2599
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2600
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2601
                          OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2602
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2603
                          OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2604
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2605
                          OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2606
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2607
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2608
}\
2609
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2610
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2611
}\
2612
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
2613
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
2614
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0)\
2615
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
2616
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
2617
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
2618
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
2619
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
2620

    
2621
QPEL_2TAP(put_, 16, mmx2)
2622
QPEL_2TAP(avg_, 16, mmx2)
2623
QPEL_2TAP(put_,  8, mmx2)
2624
QPEL_2TAP(avg_,  8, mmx2)
2625
QPEL_2TAP(put_, 16, 3dnow)
2626
QPEL_2TAP(avg_, 16, 3dnow)
2627
QPEL_2TAP(put_,  8, 3dnow)
2628
QPEL_2TAP(avg_,  8, 3dnow)


#if 0
static void just_return() { return; }
#endif

#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;

static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w = 8;
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    if( // non-constant fullpel offset (3% of blocks)
        (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
         oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
    {
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }

    asm volatile(
        "movd         %0, %%mm6 \n\t"
        "pxor      %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1<<shift)
    );

    for(x=0; x<w; x+=4){
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            asm volatile(
                "movq   %0,  %%mm4 \n\t"
                "movq   %1,  %%mm5 \n\t"
                "paddw  %2,  %%mm4 \n\t"
                "paddw  %3,  %%mm5 \n\t"
                "movq   %%mm4, %0  \n\t"
                "movq   %%mm5, %1  \n\t"
                "psrlw  $12, %%mm4 \n\t"
                "psrlw  $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            asm volatile(
                "movq   %%mm6, %%mm2 \n\t"
                "movq   %%mm6, %%mm1 \n\t"
                "psubw  %%mm4, %%mm2 \n\t"
                "psubw  %%mm5, %%mm1 \n\t"
                "movq   %%mm2, %%mm0 \n\t"
                "movq   %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)

                "movd   %4,    %%mm5 \n\t"
                "movd   %3,    %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy

                "movd   %2,    %%mm5 \n\t"
                "movd   %1,    %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw  %5,    %%mm1 \n\t"
                "paddw  %%mm3, %%mm2 \n\t"
                "paddw  %%mm1, %%mm0 \n\t"
                "paddw  %%mm2, %%mm0 \n\t"

                "psrlw    %6,    %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd     %%mm0, %0    \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4-h*stride;
    }
}
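/* Scalar sketch of what the MMX loop above computes per pixel: with
 * s = 1<<shift and dx, dy the per-pixel subpel fractions held in mm4/mm5,
 *     dst[x + y*stride] = ( src[0]        * (s-dx) * (s-dy)
 *                         + src[1]        *  dx    * (s-dy)
 *                         + src[stride]   * (s-dx) *  dy
 *                         + src[stride+1] *  dx    *  dy
 *                         + r ) >> (2*shift);
 * i.e. a bilinear blend of the four neighbouring pixels, matching the
 * ff_gmc_c() reference for this constant-fullpel-offset case. */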

#ifdef CONFIG_ENCODERS

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
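/* Worked example of the three variants (Q15 fixed point): with
 * src = dst = 0x4000 (0.5) the 32 bit product is 0x10000000, so
 *   pmulhw   -> 0x1000   (bits [16-31], truncated)
 *   pmulhrw  -> 0x1000   (same scale, but rounded via the +0x8000)
 *   pmulhrsw -> 0x2000   (bits [15-30], i.e. one extra bit of precision)
 * and with src = 2, dst = 0x4000 the product is 0x8000, where pmulhw gives 0
 * while pmulhrw rounds up to 1.  The differing output scales are what the
 * SCALE_OFFSET values below compensate for. */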
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3

#endif /* CONFIG_ENCODERS */

#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        asm volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
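/* prefetcht0 is the SSE prefetch hint (fetch into all cache levels) used on
 * MMX2-capable CPUs; plain prefetch is the 3DNow! equivalent.  Each call
 * touches one address per row of the block being prefetched. */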

#include "h264dsp_mmx.c"

/* AVS specific */
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
#ifdef CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}

static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    asm volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        asm volatile(
            "movq    %0,    %%mm0 \n\t"
            "movq    %1,    %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1 \n\t"
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t"
            "pandn   %%mm1, %%mm4 \n\t"
            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    asm volatile("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    asm volatile(
            "movaps  %0,     %%xmm5 \n\t"
        ::"m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        asm volatile(
            "movaps  %0,     %%xmm0 \n\t"
            "movaps  %1,     %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}

static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long i = (len-4)*4;
    asm volatile(
        "1: \n\t"
        "movq    (%1,%0), %%mm0 \n\t"
        "movq   8(%1,%0), %%mm1 \n\t"
        "pfmul   (%2,%0), %%mm0 \n\t"
        "pfmul  8(%2,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub  $16, %0 \n\t"
        "jge 1b \n\t"
        "femms  \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i = (len-8)*4;
    asm volatile(
        "1: \n\t"
        "movaps    (%1,%0), %%xmm0 \n\t"
        "movaps  16(%1,%0), %%xmm1 \n\t"
        "mulps     (%2,%0), %%xmm0 \n\t"
        "mulps   16(%2,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub  $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
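/* Scalar equivalent of the two functions above: both walk the arrays from the
 * end downwards, 4 (3DNow!) or 8 (SSE) floats at a time, and compute
 *     for(i=0; i<len; i++) dst[i] *= src[i];
 * assuming len is a multiple of the block size and, for the SSE version,
 * 16-byte-aligned pointers as movaps requires. */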
2996

    
2997
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2998
    long i = len*4-16;
2999
    asm volatile(
3000
        "1: \n\t"
3001
        "pswapd   8(%1), %%mm0 \n\t"
3002
        "pswapd    (%1), %%mm1 \n\t"
3003
        "pfmul  (%3,%0), %%mm0 \n\t"
3004
        "pfmul 8(%3,%0), %%mm1 \n\t"
3005
        "movq  %%mm0,  (%2,%0) \n\t"
3006
        "movq  %%mm1, 8(%2,%0) \n\t"
3007
        "add   $16, %1 \n\t"
3008
        "sub   $16, %0 \n\t"
3009
        "jge   1b \n\t"
3010
        :"+r"(i), "+r"(src1)
3011
        :"r"(dst), "r"(src0)
3012
    );
3013
    asm volatile("femms");
3014
}
3015
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
3016
    long i = len*4-32;
3017
    asm volatile(
3018
        "1: \n\t"
3019
        "movaps        16(%1), %%xmm0 \n\t"
3020
        "movaps          (%1), %%xmm1 \n\t"
3021
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
3022
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
3023
        "mulps        (%3,%0), %%xmm0 \n\t"
3024
        "mulps      16(%3,%0), %%xmm1 \n\t"
3025
        "movaps     %%xmm0,   (%2,%0) \n\t"
3026
        "movaps     %%xmm1, 16(%2,%0) \n\t"
3027
        "add    $32, %1 \n\t"
3028
        "sub    $32, %0 \n\t"
3029
        "jge    1b \n\t"
3030
        :"+r"(i), "+r"(src1)
3031
        :"r"(dst), "r"(src0)
3032
    );
3033
}
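/* Scalar equivalent: dst[i] = src0[i] * src1[len-1-i], i.e. src1 is read back
 * to front; pswapd and shufps $0x1b reverse the element order within each
 * group before the multiply. */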
3034

    
3035
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
3036
                                      const float *src2, int src3, int len, int step){
3037
    long i = (len-4)*4;
3038
    if(step == 2 && src3 == 0){
3039
        dst += (len-4)*2;
3040
        asm volatile(
3041
            "1: \n\t"
3042
            "movq   (%2,%0),  %%mm0 \n\t"
3043
            "movq  8(%2,%0),  %%mm1 \n\t"
3044
            "pfmul  (%3,%0),  %%mm0 \n\t"
3045
            "pfmul 8(%3,%0),  %%mm1 \n\t"
3046
            "pfadd  (%4,%0),  %%mm0 \n\t"
3047
            "pfadd 8(%4,%0),  %%mm1 \n\t"
3048
            "movd     %%mm0,   (%1) \n\t"
3049
            "movd     %%mm1, 16(%1) \n\t"
3050
            "psrlq      $32,  %%mm0 \n\t"
3051
            "psrlq      $32,  %%mm1 \n\t"
3052
            "movd     %%mm0,  8(%1) \n\t"
3053
            "movd     %%mm1, 24(%1) \n\t"
3054
            "sub  $32, %1 \n\t"
3055
            "sub  $16, %0 \n\t"
3056
            "jge  1b \n\t"
3057
            :"+r"(i), "+r"(dst)
3058
            :"r"(src0), "r"(src1), "r"(src2)
3059
            :"memory"
3060
        );
3061
    }
3062
    else if(step == 1 && src3 == 0){
3063
        asm volatile(
3064
            "1: \n\t"
3065
            "movq    (%2,%0), %%mm0 \n\t"
3066
            "movq   8(%2,%0), %%mm1 \n\t"
3067
            "pfmul   (%3,%0), %%mm0 \n\t"
3068
            "pfmul  8(%3,%0), %%mm1 \n\t"
3069
            "pfadd   (%4,%0), %%mm0 \n\t"
3070
            "pfadd  8(%4,%0), %%mm1 \n\t"
3071
            "movq  %%mm0,   (%1,%0) \n\t"
3072
            "movq  %%mm1,  8(%1,%0) \n\t"
3073
            "sub  $16, %0 \n\t"
3074
            "jge  1b \n\t"
3075
            :"+r"(i)
3076
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
3077
            :"memory"
3078
        );
3079
    }
3080
    else
3081
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3082
    asm volatile("femms");
3083
}
3084
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
3085
                                    const float *src2, int src3, int len, int step){
3086
    long i = (len-8)*4;
3087
    if(step == 2 && src3 == 0){
3088
        dst += (len-8)*2;
3089
        asm volatile(
3090
            "1: \n\t"
3091
            "movaps   (%2,%0), %%xmm0 \n\t"
3092
            "movaps 16(%2,%0), %%xmm1 \n\t"
3093
            "mulps    (%3,%0), %%xmm0 \n\t"
3094
            "mulps  16(%3,%0), %%xmm1 \n\t"
3095
            "addps    (%4,%0), %%xmm0 \n\t"
3096
            "addps  16(%4,%0), %%xmm1 \n\t"
3097
            "movss     %%xmm0,   (%1) \n\t"
3098
            "movss     %%xmm1, 32(%1) \n\t"
3099
            "movhlps   %%xmm0, %%xmm2 \n\t"
3100
            "movhlps   %%xmm1, %%xmm3 \n\t"
3101
            "movss     %%xmm2, 16(%1) \n\t"
3102
            "movss     %%xmm3, 48(%1) \n\t"
3103
            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
3104
            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
3105
            "movss     %%xmm0,  8(%1) \n\t"
3106
            "movss     %%xmm1, 40(%1) \n\t"
3107
            "movhlps   %%xmm0, %%xmm2 \n\t"
3108
            "movhlps   %%xmm1, %%xmm3 \n\t"
3109
            "movss     %%xmm2, 24(%1) \n\t"
3110
            "movss     %%xmm3, 56(%1) \n\t"
3111
            "sub  $64, %1 \n\t"
3112
            "sub  $32, %0 \n\t"
3113
            "jge  1b \n\t"
3114
            :"+r"(i), "+r"(dst)
3115
            :"r"(src0), "r"(src1), "r"(src2)
3116
            :"memory"
3117
        );
3118
    }
3119
    else if(step == 1 && src3 == 0){
3120
        asm volatile(
3121
            "1: \n\t"
3122
            "movaps   (%2,%0), %%xmm0 \n\t"
3123
            "movaps 16(%2,%0), %%xmm1 \n\t"
3124
            "mulps    (%3,%0), %%xmm0 \n\t"
3125
            "mulps  16(%3,%0), %%xmm1 \n\t"
3126
            "addps    (%4,%0), %%xmm0 \n\t"
3127
            "addps  16(%4,%0), %%xmm1 \n\t"
3128
            "movaps %%xmm0,   (%1,%0) \n\t"
3129
            "movaps %%xmm1, 16(%1,%0) \n\t"
3130
            "sub  $32, %0 \n\t"
3131
            "jge  1b \n\t"
3132
            :"+r"(i)
3133
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
3134
            :"memory"
3135
        );
3136
    }
3137
    else
3138
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3139
}
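/* Scalar equivalent of the two special cases handled above:
 *     dst[i*step] = src0[i]*src1[i] + src2[i];   // step 1 or 2, src3 == 0
 * any other combination falls back to ff_vector_fmul_add_add_c(). */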
3140

    
3141
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
3142
    // not bit-exact: pf2id uses different rounding than C and SSE
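    // pf2id truncates toward zero, whereas cvtps2pi in the SSE version below
    // follows the MXCSR rounding mode (round-to-nearest by default), so e.g.
    // 1.7f becomes 1 here but 2 there.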
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "pf2id       %1, %%mm0 \n\t"
            "pf2id       %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("femms");
}
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "cvtps2pi    %1, %%mm0 \n\t"
            "cvtps2pi    %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("emms");
}

#ifdef CONFIG_SNOW_DECODER
extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
#endif

void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    mm_flags = mm_support();

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & FF_MM_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }
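    /* Example (assuming the FF_MM_* names from avcodec.h): a caller setting
       dsp_mask = FF_MM_FORCE|FF_MM_MMX forces the MMX paths on even if
       detection missed them, while dsp_mask = FF_MM_3DNOW alone disables the
       3DNow! paths. */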
3192

    
3193
#if 0
3194
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
3195
    if (mm_flags & MM_MMX)
3196
        av_log(avctx, AV_LOG_INFO, " mmx");
3197
    if (mm_flags & MM_MMXEXT)
3198
        av_log(avctx, AV_LOG_INFO, " mmxext");
3199
    if (mm_flags & MM_3DNOW)
3200
        av_log(avctx, AV_LOG_INFO, " 3dnow");
3201
    if (mm_flags & MM_SSE)
3202
        av_log(avctx, AV_LOG_INFO, " sse");
3203
    if (mm_flags & MM_SSE2)
3204
        av_log(avctx, AV_LOG_INFO, " sse2");
3205
    av