
ffmpeg / libavcodec / i386 / dsputil_mmx.c @ 9fa35729


/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "dsputil.h"
#include "dsputil_mmx.h"
#include "simple_idct.h"
#include "mpegvideo.h"
#include "x86_cpu.h"
#include "mmx.h"
#include "vp3dsp_mmx.h"
#include "vp3dsp_sse2.h"
#include "h263.h"

//#undef NDEBUG
//#include <assert.h>

extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */

/* pixel operations */
DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_5  ) = 0x0005000500050005ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_32 ) = 0x0020002000200020ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED_16(const uint64_t, ff_pw_128) = 0x0080008000800080ULL;

DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };

#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// for shared libraries it is better to synthesize these constants in
// registers than to load them from memory
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"
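
// An illustrative scalar view of the two byte-averaging tricks above:
// masking the low bit of each byte of a^b with 0xFE before the shift keeps
// the per-byte right shift from leaking bits into the neighbouring lane.
//   no-rnd: (a & b) + (((a ^ b) & 0xFE) >> 1) == (a + b) >> 1       (round down)
//   rnd:    (a | b) - (((a ^ b) & 0xFE) >> 1) == (a + b + 1) >> 1   (round up)
// e.g. a=3, b=4: no-rnd gives 0 + (7 & 0xFE)/2 = 3, rnd gives 7 - 3 = 4.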

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

#define SBUTTERFLY(a,b,t,n,m)\
    "mov" #m " " #a ", " #t "         \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
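
// TRANSPOSE4 transposes a 4x4 block of 16-bit words with two passes of
// interleave butterflies (punpcklwd/punpckhwd, then punpckldq/punpckhdq).
// Note the result is permuted across registers: the four output rows end
// up in a, d, t, c respectively, as the per-step comments above show.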

/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
#endif //CONFIG_ENCODERS

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
        __asm __volatile(
                "movq   %3, %%mm0               \n\t"
                "movq   8%3, %%mm1              \n\t"
                "movq   16%3, %%mm2             \n\t"
                "movq   24%3, %%mm3             \n\t"
                "movq   32%3, %%mm4             \n\t"
                "movq   40%3, %%mm5             \n\t"
                "movq   48%3, %%mm6             \n\t"
                "movq   56%3, %%mm7             \n\t"
                "packuswb %%mm1, %%mm0          \n\t"
                "packuswb %%mm3, %%mm2          \n\t"
                "packuswb %%mm5, %%mm4          \n\t"
                "packuswb %%mm7, %%mm6          \n\t"
                "movq   %%mm0, (%0)             \n\t"
                "movq   %%mm2, (%0, %1)         \n\t"
                "movq   %%mm4, (%0, %1, 2)      \n\t"
                "movq   %%mm6, (%0, %2)         \n\t"
                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
                :"memory");
        pix += line_size*4;
        p += 32;

    // if this were an exact copy of the code above, the compiler would
    // generate some very strange code, thus the "r" constraint is used
    // here instead of "m"
    __asm __volatile(
            "movq       (%3), %%mm0             \n\t"
            "movq       8(%3), %%mm1            \n\t"
            "movq       16(%3), %%mm2           \n\t"
            "movq       24(%3), %%mm3           \n\t"
            "movq       32(%3), %%mm4           \n\t"
            "movq       40(%3), %%mm5           \n\t"
            "movq       48(%3), %%mm6           \n\t"
            "movq       56(%3), %%mm7           \n\t"
            "packuswb %%mm1, %%mm0              \n\t"
            "packuswb %%mm3, %%mm2              \n\t"
            "packuswb %%mm5, %%mm4              \n\t"
            "packuswb %%mm7, %%mm6              \n\t"
            "movq       %%mm0, (%0)             \n\t"
            "movq       %%mm2, (%0, %1)         \n\t"
            "movq       %%mm4, (%0, %1, 2)      \n\t"
            "movq       %%mm6, (%0, %2)         \n\t"
            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
            :"memory");
}
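
// put_pixels_clamped stores an 8x8 block of 16-bit coefficients as bytes;
// packuswb provides the clamp to [0,255] for free, since it packs words to
// bytes with unsigned saturation.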

static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}

void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
                "movq   (%2), %%mm0     \n\t"
                "movq   8(%2), %%mm1    \n\t"
                "movq   16(%2), %%mm2   \n\t"
                "movq   24(%2), %%mm3   \n\t"
                "movq   %0, %%mm4       \n\t"
                "movq   %1, %%mm6       \n\t"
                "movq   %%mm4, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm4, %%mm0    \n\t"
                "paddsw %%mm5, %%mm1    \n\t"
                "movq   %%mm6, %%mm5    \n\t"
                "punpcklbw %%mm7, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm5 \n\t"
                "paddsw %%mm6, %%mm2    \n\t"
                "paddsw %%mm5, %%mm3    \n\t"
                "packuswb %%mm1, %%mm0  \n\t"
                "packuswb %%mm3, %%mm2  \n\t"
                "movq   %%mm0, %0       \n\t"
                "movq   %%mm2, %1       \n\t"
                :"+m"(*pix), "+m"(*(pix+line_size))
                :"r"(p)
                :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "mov $-128*6, %%"REG_a"         \n\t"
                "1:                             \n\t"
                "movq %%mm7, (%0, %%"REG_a")    \n\t"
                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
                "add $32, %%"REG_a"             \n\t"
                " js 1b                         \n\t"
                : : "r" (((uint8_t *)blocks)+128*6)
                : "%"REG_a
        );
}

#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((long)line_size)
        );

        return sum;
}
#endif //CONFIG_ENCODERS
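
// pix_sum16 keeps four 16-bit partial sums in mm6; the tail of the asm does
// a halving horizontal reduction (psrlq $32 + paddw, then psrlq $16 + paddw)
// and masks the result to 16 bits. Since 16*16*255 = 65280 still fits in a
// word, the word-wise accumulation cannot overflow.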

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7              \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "psubw %%mm2, %%mm0             \n\t"\
        "psubw %%mm3, %%mm1             \n\t"\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "punpcklbw %%mm7, %%mm4         \n\t"\
        "punpckhbw %%mm7, %%mm5         \n\t"\
        "psubw %%mm2, %%mm4             \n\t"\
        "psubw %%mm3, %%mm5             \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4             \n\t"\
        "paddw %%mm1, %%mm5             \n\t"\
        "pxor %%mm6, %%mm6              \n\t"\
        "pcmpgtw %%mm4, %%mm6           \n\t"\
        "pcmpgtw %%mm5, %%mm7           \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "pxor %%mm7, %%mm5              \n\t"\
        "psubw %%mm6, %%mm4             \n\t"\
        "psubw %%mm7, %%mm5             \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm4          \n\t"\
        "packsswb %%mm7, %%mm6          \n\t"\
        "pxor %%mm7, %%mm7              \n\t"\
        "movd %4, %%mm2                 \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "psubusb %%mm4, %%mm2           \n\t"\
        "movq %%mm2, %%mm3              \n\t"\
        "psubusb %%mm4, %%mm3           \n\t"\
        "psubb %%mm3, %%mm2             \n\t"\
        "movq %1, %%mm3                 \n\t"\
        "movq %2, %%mm4                 \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm3           \n\t"\
        "psubusb %%mm2, %%mm4           \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm2           \n\t"\
        "packsswb %%mm1, %%mm0          \n\t"\
        "pcmpgtb %%mm0, %%mm7           \n\t"\
        "pxor %%mm7, %%mm0              \n\t"\
        "psubb %%mm7, %%mm0             \n\t"\
        "movq %%mm0, %%mm1              \n\t"\
        "psubusb %%mm2, %%mm0           \n\t"\
        "psubb %%mm0, %%mm1             \n\t"\
        "pand %5, %%mm1                 \n\t"\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1              \n\t"\
        "psubb %%mm7, %%mm1             \n\t"\
        "movq %0, %%mm5                 \n\t"\
        "movq %3, %%mm6                 \n\t"\
        "psubb %%mm1, %%mm5             \n\t"\
        "paddb %%mm1, %%mm6             \n\t"
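
// H263_LOOP_FILTER filters 8 pixels across a block edge, given the two
// rows/columns on each side in %0..%3. In effect it computes
// diff = (p0 - p3 + 4*(p2 - p1)) / 8, clips the resulting correction
// against the filter strength, and leaves the filtered rows in
// mm5/mm3/mm4/mm6 (for %0/%1/%2/%3) for the caller to store.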

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    uint64_t temp[4] __attribute__ ((aligned(8)));
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1              \n\t"
        "movq %%mm4, %%mm0              \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "movq %%mm1, %%mm6              \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd %%mm5, (%0)               \n\t"
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd %%mm5, (%0,%2)            \n\t"
        "movd %%mm3, (%0,%2,2)          \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd %%mm3, (%0,%3)            \n\t"
        "movd %%mm1, (%1)               \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%1,%2)            \n\t"
        "movd %%mm6, (%1,%2,2)          \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd %%mm6, (%1,%3)            \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride ),
           "r" ((long)(3*stride))
    );
    }
}
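
// The horizontal-edge filter reuses the vertical one on transposed data:
// the 8x4 strip around the edge is transposed into temp[], filtered with
// H263_LOOP_FILTER, and the second asm block transposes the filtered bytes
// (still sitting in mm3..mm6) back into place as it stores them.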

#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((long)line_size));
    return tmp;
}

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}
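
// hf_noise8 sums the absolute value of a second-order difference: the byte
// shifts build the horizontal gradient pix[x] - pix[x+1] of each row, and
// the gradients of successive rows are then subtracted, abs'd
// (pcmpgtw/pxor/psubw) and accumulated. Roughly, this measures the amount
// of high-frequency "noise" in the block.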

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
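
// NSSE = SSE + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|: a plain sum
// of squared errors, biased so that encodings which add or remove
// high-frequency "noise" relative to the source are penalized.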

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
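
// vsad_intra16 sums |pix[x] - pix[x + line_size]| over a 16-wide block,
// i.e. the total vertical gradient; the MMX2 version below computes the
// same thing far more cheaply with psadbw.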
1280

    
1281
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1282
    int tmp;
1283

    
1284
    assert( (((int)pix) & 7) == 0);
1285
    assert((line_size &7) ==0);
1286

    
1287
#define SUM(in0, in1, out0, out1) \
1288
      "movq (%0), " #out0 "\n"\
1289
      "movq 8(%0), " #out1 "\n"\
1290
      "add %2,%0\n"\
1291
      "psadbw " #out0 ", " #in0 "\n"\
1292
      "psadbw " #out1 ", " #in1 "\n"\
1293
      "paddw " #in1 ", " #in0 "\n"\
1294
      "paddw " #in0 ", %%mm6\n"
1295

    
1296
  asm volatile (
1297
      "movl %3,%%ecx\n"
1298
      "pxor %%mm6,%%mm6\n"
1299
      "pxor %%mm7,%%mm7\n"
1300
      "movq (%0),%%mm0\n"
1301
      "movq 8(%0),%%mm1\n"
1302
      "add %2,%0\n"
1303
      "subl $2, %%ecx\n"
1304
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1305
      "1:\n"
1306

    
1307
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1308

    
1309
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1310

    
1311
      "subl $2, %%ecx\n"
1312
      "jnz 1b\n"
1313

    
1314
      "movd %%mm6,%1\n"
1315
      : "+r" (pix), "=r"(tmp)
1316
      : "r" ((long)line_size) , "m" (h)
1317
      : "%ecx");
1318
    return tmp;
1319
}
1320
#undef SUM
1321

    
1322
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1323
    int tmp;
1324

    
1325
    assert( (((int)pix1) & 7) == 0);
1326
    assert( (((int)pix2) & 7) == 0);
1327
    assert((line_size &7) ==0);
1328

    
1329
#define SUM(in0, in1, out0, out1) \
1330
      "movq (%0),%%mm2\n"\
1331
      "movq (%1)," #out0 "\n"\
1332
      "movq 8(%0),%%mm3\n"\
1333
      "movq 8(%1)," #out1 "\n"\
1334
      "add %3,%0\n"\
1335
      "add %3,%1\n"\
1336
      "psubb " #out0 ", %%mm2\n"\
1337
      "psubb " #out1 ", %%mm3\n"\
1338
      "pxor %%mm7, %%mm2\n"\
1339
      "pxor %%mm7, %%mm3\n"\
1340
      "movq %%mm2, " #out0 "\n"\
1341
      "movq %%mm3, " #out1 "\n"\
1342
      "psubusb " #in0 ", %%mm2\n"\
1343
      "psubusb " #in1 ", %%mm3\n"\
1344
      "psubusb " #out0 ", " #in0 "\n"\
1345
      "psubusb " #out1 ", " #in1 "\n"\
1346
      "por %%mm2, " #in0 "\n"\
1347
      "por %%mm3, " #in1 "\n"\
1348
      "movq " #in0 ", %%mm2\n"\
1349
      "movq " #in1 ", %%mm3\n"\
1350
      "punpcklbw %%mm7, " #in0 "\n"\
1351
      "punpcklbw %%mm7, " #in1 "\n"\
1352
      "punpckhbw %%mm7, %%mm2\n"\
1353
      "punpckhbw %%mm7, %%mm3\n"\
1354
      "paddw " #in1 ", " #in0 "\n"\
1355
      "paddw %%mm3, %%mm2\n"\
1356
      "paddw %%mm2, " #in0 "\n"\
1357
      "paddw " #in0 ", %%mm6\n"
1358

    
1359

    
1360
  asm volatile (
1361
      "movl %4,%%ecx\n"
1362
      "pxor %%mm6,%%mm6\n"
1363
      "pcmpeqw %%mm7,%%mm7\n"
1364
      "psllw $15, %%mm7\n"
1365
      "packsswb %%mm7, %%mm7\n"
1366
      "movq (%0),%%mm0\n"
1367
      "movq (%1),%%mm2\n"
1368
      "movq 8(%0),%%mm1\n"
1369
      "movq 8(%1),%%mm3\n"
1370
      "add %3,%0\n"
1371
      "add %3,%1\n"
1372
      "subl $2, %%ecx\n"
1373
      "psubb %%mm2, %%mm0\n"
1374
      "psubb %%mm3, %%mm1\n"
1375
      "pxor %%mm7, %%mm0\n"
1376
      "pxor %%mm7, %%mm1\n"
1377
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1378
      "1:\n"
1379

    
1380
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1381

    
1382
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1383

    
1384
      "subl $2, %%ecx\n"
1385
      "jnz 1b\n"
1386

    
1387
      "movq %%mm6,%%mm0\n"
1388
      "psrlq $32, %%mm6\n"
1389
      "paddw %%mm6,%%mm0\n"
1390
      "movq %%mm0,%%mm6\n"
1391
      "psrlq $16, %%mm0\n"
1392
      "paddw %%mm6,%%mm0\n"
1393
      "movd %%mm0,%2\n"
1394
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1395
      : "r" ((long)line_size) , "m" (h)
1396
      : "%ecx");
1397
    return tmp & 0x7FFF;
1398
}
1399
#undef SUM
1400

    
1401
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1402
    int tmp;
1403

    
1404
    assert( (((int)pix1) & 7) == 0);
1405
    assert( (((int)pix2) & 7) == 0);
1406
    assert((line_size &7) ==0);
1407

    
1408
#define SUM(in0, in1, out0, out1) \
1409
      "movq (%0)," #out0 "\n"\
1410
      "movq (%1),%%mm2\n"\
1411
      "movq 8(%0)," #out1 "\n"\
1412
      "movq 8(%1),%%mm3\n"\
1413
      "add %3,%0\n"\
1414
      "add %3,%1\n"\
1415
      "psubb %%mm2, " #out0 "\n"\
1416
      "psubb %%mm3, " #out1 "\n"\
1417
      "pxor %%mm7, " #out0 "\n"\
1418
      "pxor %%mm7, " #out1 "\n"\
1419
      "psadbw " #out0 ", " #in0 "\n"\
1420
      "psadbw " #out1 ", " #in1 "\n"\
1421
      "paddw " #in1 ", " #in0 "\n"\
1422
      "paddw " #in0 ", %%mm6\n"
1423

    
1424
  asm volatile (
1425
      "movl %4,%%ecx\n"
1426
      "pxor %%mm6,%%mm6\n"
1427
      "pcmpeqw %%mm7,%%mm7\n"
1428
      "psllw $15, %%mm7\n"
1429
      "packsswb %%mm7, %%mm7\n"
1430
      "movq (%0),%%mm0\n"
1431
      "movq (%1),%%mm2\n"
1432
      "movq 8(%0),%%mm1\n"
1433
      "movq 8(%1),%%mm3\n"
1434
      "add %3,%0\n"
1435
      "add %3,%1\n"
1436
      "subl $2, %%ecx\n"
1437
      "psubb %%mm2, %%mm0\n"
1438
      "psubb %%mm3, %%mm1\n"
1439
      "pxor %%mm7, %%mm0\n"
1440
      "pxor %%mm7, %%mm1\n"
1441
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1442
      "1:\n"
1443

    
1444
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1445

    
1446
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1447

    
1448
      "subl $2, %%ecx\n"
1449
      "jnz 1b\n"
1450

    
1451
      "movd %%mm6,%2\n"
1452
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1453
      : "r" ((long)line_size) , "m" (h)
1454
      : "%ecx");
1455
    return tmp;
1456
}
1457
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
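
/* Note on the pmaxub/pminub chain above: it evaluates
 *     pred = median(L, T, L + T - LT)
 * via the identity median(a,b,c) = max(min(a,b), min(max(a,b), c)),
 * which needs no branches, only unsigned byte min/max. */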

#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#ifdef ARCH_X86_64
// permutes 01234567 -> 05736421
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
    SBUTTERFLY(c,d,b,wd,dqa)\
    SBUTTERFLY(e,f,d,wd,dqa)\
    SBUTTERFLY(g,h,f,wd,dqa)\
    SBUTTERFLY(a,c,h,dq,dqa)\
    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
    SBUTTERFLY(e,g,b,dq,dqa)\
    SBUTTERFLY(d,f,g,dq,dqa)\
    SBUTTERFLY(a,e,f,qdq,dqa)\
    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
    SBUTTERFLY(h,b,d,qdq,dqa)\
    SBUTTERFLY(c,g,b,qdq,dqa)\
    "movdqa %%xmm8, "#g"              \n\t"
#else
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
    "movdqa "#h", "#t"                \n\t"\
    SBUTTERFLY(a,b,h,wd,dqa)\
    "movdqa "#h", 16"#t"              \n\t"\
    "movdqa "#t", "#h"                \n\t"\
    SBUTTERFLY(c,d,b,wd,dqa)\
    SBUTTERFLY(e,f,d,wd,dqa)\
    SBUTTERFLY(g,h,f,wd,dqa)\
    SBUTTERFLY(a,c,h,dq,dqa)\
    "movdqa "#h", "#t"                \n\t"\
    "movdqa 16"#t", "#h"              \n\t"\
    SBUTTERFLY(h,b,c,dq,dqa)\
    SBUTTERFLY(e,g,b,dq,dqa)\
    SBUTTERFLY(d,f,g,dq,dqa)\
    SBUTTERFLY(a,e,f,qdq,dqa)\
    SBUTTERFLY(h,d,e,qdq,dqa)\
    "movdqa "#h", 16"#t"              \n\t"\
    "movdqa "#t", "#h"                \n\t"\
    SBUTTERFLY(h,b,d,qdq,dqa)\
    SBUTTERFLY(c,g,b,qdq,dqa)\
    "movdqa 16"#t", "#g"              \n\t"
#endif

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1),    "#a"           \n\t"\
    "movq "#o"+8(%1),  "#b"           \n\t"\
    "movq "#o"+16(%1), "#c"           \n\t"\
    "movq "#o"+24(%1), "#d"           \n\t"\

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)              \n\t"\
    "movq "#b", "#o"+8(%1)            \n\t"\
    "movq "#c", "#o"+16(%1)           \n\t"\
    "movq "#d", "#o"+24(%1)           \n\t"\

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif
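
/* For reference, the scalar computation the hadamard8_diff_* functions above
 * implement: an 8x8 Hadamard-transformed SAD (SATD) of the difference block,
 * rows then columns, followed by a sum of absolute values.  A rough sketch,
 * assuming the usual in-place Walsh-Hadamard butterflies (illustrative only,
 * not compiled; the names are ad hoc, and the asm additionally saturates
 * and clamps with &0xFFFF): */
#if 0
static void wht8_sketch(int16_t *d, int step){
    int i, j, k;
    for(k=1; k<8; k<<=1)                        /* three butterfly stages */
        for(i=0; i<8; i+=2*k)
            for(j=i; j<i+k; j++){
                int a= d[ j   *step];
                int b= d[(j+k)*step];
                d[ j   *step]= a+b;
                d[(j+k)*step]= a-b;
            }
}
static int hadamard8_diff_scalar_sketch(uint8_t *src1, uint8_t *src2, int stride){
    int16_t tmp[64];
    int i, sum=0;
    for(i=0; i<64; i++)
        tmp[i]= src1[(i>>3)*stride + (i&7)] - src2[(i>>3)*stride + (i&7)];
    for(i=0; i<8; i++) wht8_sketch(tmp+8*i, 1);   /* transform rows */
    for(i=0; i<8; i++) wht8_sketch(tmp+i,   8);   /* transform columns */
    for(i=0; i<64; i++)
        sum += FFABS(tmp[i]);
    return sum;
}
#endif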

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD
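
/* Scalar equivalent of the sum_abs_dctelem_* functions above: the DCT_SAD4
 * offsets 0/8/64/72 together cover all 64 coefficients of the block.
 * (illustrative sketch only, not compiled) */
#if 0
static int sum_abs_dctelem_scalar_sketch(DCTELEM *block){
    int i, sum=0;
    for(i=0; i<64; i++)
        sum += FFABS(block[i]);
    return sum;
}
#endif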

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    long i=size;
    asm volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
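
/* Scalar form of the loop above, for reference; the asm walks backwards
 * from size to 0 in steps of 8, sign-extending the int8 samples with the
 * punpck/psraw pair.  (sketch only, not compiled) */
#if 0
static int ssd_int8_vs_int16_scalar_sketch(const int8_t *pix1, const int16_t *pix2, int size){
    int i, sum=0;
    for(i=0; i<size; i++){
        int d= pix1[i] - pix2[i];
        sum += d*d;
    }
    return sum;
}
#endif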
1915

    
1916
#endif //CONFIG_ENCODERS
1917

    
1918
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
1919
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
1920

    
1921
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1922
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
1923
        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
1924
        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
1925
        "movq "#in7", " #m3 "             \n\t" /* d */\
1926
        "movq "#in0", %%mm5               \n\t" /* D */\
1927
        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
1928
        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
1929
        "movq "#in1", %%mm5               \n\t" /* C */\
1930
        "movq "#in2", %%mm6               \n\t" /* B */\
1931
        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
1932
        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
1933
        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
1934
        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
1935
        "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
1936
        "paddw " #rnd ", %%mm4            \n\t" /* x2 */\
1937
        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1938
        "psraw $5, %%mm5                  \n\t"\
1939
        "packuswb %%mm5, %%mm5            \n\t"\
1940
        OP(%%mm5, out, %%mm7, d)
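
/* QPEL_V_LOW computes one row of the MPEG-4 8-tap quarter-pel lowpass:
 *     out = clip8((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5)
 * where x1..x4 are the symmetric tap-pair sums annotated above, i.e. the
 * coefficient pattern (-1, 3, -6, 20, 20, -6, 3, -1) applied outward from
 * the center pair. */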

#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        "movq %%mm0, %5                   \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2               \n\t" /* b */\
        "paddw %%mm5, %%mm3               \n\t" /* c */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm4               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "movq %5, %%mm1                   \n\t"\
        "packuswb %%mm3, %%mm1            \n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5               \n\t" /* b */\
        "paddw %%mm4, %%mm0               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2               \n\t" /* d */\
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6               \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3               \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4               \n\t" /* c */\
        "paddw %%mm2, %%mm5               \n\t" /* d */\
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                  \n\t"\
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                  \n\t"\
        "packuswb %%mm4, %%mm0            \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0               \n\t"\
            "movq 8(%0), %%mm1              \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0             \n\t"\
            "movq 24(%0), %%mm1             \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm2               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3               \n\t" /* c */\
        "paddw %%mm5, %%mm4               \n\t" /* d */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "packuswb %%mm3, %%mm0            \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0           \n\t"\
            "movq 8(%0), %%mm1          \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            :"memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}

#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "movq 8(%0), %%mm2              \n\t"\
        "movq 8(%0), %%mm3              \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 17*8(%1)           \n\t"\
        "movq %%mm2, 2*17*8(%1)         \n\t"\
        "movq %%mm3, 3*17*8(%1)         \n\t"\
        "add $8, %1                     \n\t"\
        "add %3, %0                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=4;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7              \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"  \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "add $136, %0                   \n\t"\
        "add %6, %1                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
        :"memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*2];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 9*8(%1)            \n\t"\
        "add $8, %1                     \n\t"\
        "add %3, %0                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=2;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7              \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
                \
        "add $72, %0                    \n\t"\
        "add %6, %1                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
         \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
        : "memory"\
   );\
}\
\
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
}\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}
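
/* Naming: qpelW_mcXY selects the quarter-pel position (X,Y), in quarter-pel
 * units, within the W x W block: mc00 is a plain copy, mc20 the horizontal
 * half-pel filter, mc02 the vertical one, and the remaining positions are
 * built by averaging filtered intermediates (halfH/halfHV) as above. */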

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"

QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
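
/* The ROUNDER argument is ff_pw_16 for the normal variants and ff_pw_15 for
 * the no_rnd ones: with the final >>5, +16 rounds to nearest while +15
 * (i.e. +16-1) gives the biased-down result selected by MPEG-4's
 * rounding-control bit. */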

/***********************************/
/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */

#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
}
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
}

#define QPEL_2TAP(OPNAME, SIZE, MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
                          OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
                          OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
                          OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
}\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
}\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\

QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_,  8, mmx2)
QPEL_2TAP(avg_,  8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)


#if 0
static void just_return() { return; }
#endif

#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;

static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w = 8;
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    if( // non-constant fullpel offset (3% of blocks)
        (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
         oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
    {
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }
2686

    
2687
    asm volatile(
2688
        "movd         %0, %%mm6 \n\t"
2689
        "pxor      %%mm7, %%mm7 \n\t"
2690
        "punpcklwd %%mm6, %%mm6 \n\t"
2691
        "punpcklwd %%mm6, %%mm6 \n\t"
2692
        :: "r"(1<<shift)
2693
    );
2694

    
2695
    for(x=0; x<w; x+=4){
2696
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
2697
                            oxs - dxys + dxxs*(x+1),
2698
                            oxs - dxys + dxxs*(x+2),
2699
                            oxs - dxys + dxxs*(x+3) };
2700
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
2701
                            oys - dyys + dyxs*(x+1),
2702
                            oys - dyys + dyxs*(x+2),
2703
                            oys - dyys + dyxs*(x+3) };
2704

    
2705
        for(y=0; y<h; y++){
2706
            asm volatile(
2707
                "movq   %0,  %%mm4 \n\t"
2708
                "movq   %1,  %%mm5 \n\t"
2709
                "paddw  %2,  %%mm4 \n\t"
2710
                "paddw  %3,  %%mm5 \n\t"
2711
                "movq   %%mm4, %0  \n\t"
2712
                "movq   %%mm5, %1  \n\t"
2713
                "psrlw  $12, %%mm4 \n\t"
2714
                "psrlw  $12, %%mm5 \n\t"
2715
                : "+m"(*dx4), "+m"(*dy4)
2716
                : "m"(*dxy4), "m"(*dyy4)
2717
            );
2718

    
2719
            asm volatile(
2720
                "movq   %%mm6, %%mm2 \n\t"
2721
                "movq   %%mm6, %%mm1 \n\t"
2722
                "psubw  %%mm4, %%mm2 \n\t"
2723
                "psubw  %%mm5, %%mm1 \n\t"
2724
                "movq   %%mm2, %%mm0 \n\t"
2725
                "movq   %%mm4, %%mm3 \n\t"
2726
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
2727
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
2728
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
2729
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
2730

    
2731
                "movd   %4,    %%mm5 \n\t"
2732
                "movd   %3,    %%mm4 \n\t"
2733
                "punpcklbw %%mm7, %%mm5 \n\t"
2734
                "punpcklbw %%mm7, %%mm4 \n\t"
2735
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
2736
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
2737

    
2738
                "movd   %2,    %%mm5 \n\t"
2739
                "movd   %1,    %%mm4 \n\t"
2740
                "punpcklbw %%mm7, %%mm5 \n\t"
2741
                "punpcklbw %%mm7, %%mm4 \n\t"
2742
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
2743
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
2744
                "paddw  %5,    %%mm1 \n\t"
2745
                "paddw  %%mm3, %%mm2 \n\t"
2746
                "paddw  %%mm1, %%mm0 \n\t"
2747
                "paddw  %%mm2, %%mm0 \n\t"
2748

    
2749
                "psrlw    %6,    %%mm0 \n\t"
2750
                "packuswb %%mm0, %%mm0 \n\t"
2751
                "movd     %%mm0, %0    \n\t"
2752

    
2753
                : "=m"(dst[x+y*stride])
2754
                : "m"(src[0]), "m"(src[1]),
2755
                  "m"(src[stride]), "m"(src[stride+1]),
2756
                  "m"(*r4), "m"(shift2)
2757
            );
2758
            src += stride;
2759
        }
2760
        src += 4-h*stride;
2761
    }
2762
}
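/* What the inner loop above computes per output pixel, as a scalar sketch
 * (illustrative only, not part of the build; the helper name is
 * hypothetical). s = 1<<shift, dx/dy are the subpel fractions kept by the
 * psrlw $12, r is the rounder, and packuswb supplies the saturation: */
#if 0
static inline uint8_t gmc_bilin_1(const uint8_t *src, int stride,
                                  int dx, int dy, int s, int r, int shift)
{
    int v = (  src[0]        * (s-dx) * (s-dy)
             + src[1]        *  dx    * (s-dy)
             + src[stride]   * (s-dx) *  dy
             + src[stride+1] *  dx    *  dy
             + r ) >> (2*shift);
    return v > 255 ? 255 : v < 0 ? 0 : v;
}
#endif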

#ifdef CONFIG_ENCODERS

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
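/* Scalar model of those three multiplies -- an illustrative sketch only,
 * not part of the build (helper names are hypothetical): */
#if 0
static inline int16_t pmulhw_1  (int16_t a, int16_t b){ return (a*b) >> 16; }
static inline int16_t pmulhrw_1 (int16_t a, int16_t b){ return (a*b + 0x8000) >> 16; }
static inline int16_t pmulhrsw_1(int16_t a, int16_t b){ return (a*b + 0x4000) >> 15; }
#endif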
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3

#endif /* CONFIG_ENCODERS */

#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        asm volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH

#include "h264dsp_mmx.c"

/* CAVS specific */
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* FLAC specific */
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);

/* VC1 specific */
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}

/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

/* XXX: those functions should be removed as soon as all IDCTs are
   converted */
#ifdef CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}

static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    asm volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        asm volatile(
            "movq    %0,    %%mm0 \n\t"
            "movq    %1,    %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1 \n\t"
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t"
            "pandn   %%mm1, %%mm4 \n\t"
            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    asm volatile("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    asm volatile(
            "movaps  %0,     %%xmm5 \n\t"
        ::"m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        asm volatile(
            "movaps  %0,     %%xmm0 \n\t"
            "movaps  %1,     %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}
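/* Scalar sketch of the magnitude/angle decoupling performed above
 * (illustrative only, in the spirit of the plain C version in the Vorbis
 * decoder; not part of the build): */
#if 0
static void vorbis_inverse_coupling_ref(float *mag, float *ang, int blocksize)
{
    int i;
    for(i=0; i<blocksize; i++) {
        if (mag[i] > 0.0) {
            if (ang[i] > 0.0) { ang[i] = mag[i] - ang[i]; }
            else              { float t = ang[i]; ang[i] = mag[i]; mag[i] += t; }
        } else {
            if (ang[i] > 0.0) { ang[i] += mag[i]; }
            else              { float t = ang[i]; ang[i] = mag[i]; mag[i] -= t; }
        }
    }
}
#endif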

static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long i = (len-4)*4;
    asm volatile(
        "1: \n\t"
        "movq    (%1,%0), %%mm0 \n\t"
        "movq   8(%1,%0), %%mm1 \n\t"
        "pfmul   (%2,%0), %%mm0 \n\t"
        "pfmul  8(%2,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub  $16, %0 \n\t"
        "jge 1b \n\t"
        "femms  \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i = (len-8)*4;
    asm volatile(
        "1: \n\t"
        "movaps    (%1,%0), %%xmm0 \n\t"
        "movaps  16(%1,%0), %%xmm1 \n\t"
        "mulps     (%2,%0), %%xmm0 \n\t"
        "mulps   16(%2,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub  $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
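/* Scalar sketch of what both versions compute (illustrative only; the SIMD
 * versions assume len is a multiple of 4 (3DNow!) or 8 (SSE), and the SSE
 * one additionally assumes 16-byte aligned pointers): */
#if 0
static void vector_fmul_ref(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}
#endif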

static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;
    asm volatile(
        "1: \n\t"
        "pswapd   8(%1), %%mm0 \n\t"
        "pswapd    (%1), %%mm1 \n\t"
        "pfmul  (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq  %%mm0,  (%2,%0) \n\t"
        "movq  %%mm1, 8(%2,%0) \n\t"
        "add   $16, %1 \n\t"
        "sub   $16, %0 \n\t"
        "jge   1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
    asm volatile("femms");
}
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;
    asm volatile(
        "1: \n\t"
        "movaps        16(%1), %%xmm0 \n\t"
        "movaps          (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps        (%3,%0), %%xmm0 \n\t"
        "mulps      16(%3,%0), %%xmm1 \n\t"
        "movaps     %%xmm0,   (%2,%0) \n\t"
        "movaps     %%xmm1, 16(%2,%0) \n\t"
        "add    $32, %1 \n\t"
        "sub    $32, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
}
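/* Scalar sketch (illustrative only): src1 is walked backwards, which is why
 * the SIMD versions reverse each block with pswapd/shufps $0x1b: */
#if 0
static void vector_fmul_reverse_ref(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[len-1-i];
}
#endif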

static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long i = (len-4)*4;
    if(step == 2 && src3 == 0){
        dst += (len-4)*2;
        asm volatile(
            "1: \n\t"
            "movq   (%2,%0),  %%mm0 \n\t"
            "movq  8(%2,%0),  %%mm1 \n\t"
            "pfmul  (%3,%0),  %%mm0 \n\t"
            "pfmul 8(%3,%0),  %%mm1 \n\t"
            "pfadd  (%4,%0),  %%mm0 \n\t"
            "pfadd 8(%4,%0),  %%mm1 \n\t"
            "movd     %%mm0,   (%1) \n\t"
            "movd     %%mm1, 16(%1) \n\t"
            "psrlq      $32,  %%mm0 \n\t"
            "psrlq      $32,  %%mm1 \n\t"
            "movd     %%mm0,  8(%1) \n\t"
            "movd     %%mm1, 24(%1) \n\t"
            "sub  $32, %1 \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movq  %%mm0,   (%1,%0) \n\t"
            "movq  %%mm1,  8(%1,%0) \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    asm volatile("femms");
}
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    long i = (len-8)*4;
    if(step == 2 && src3 == 0){
        dst += (len-8)*2;
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movss     %%xmm0,   (%1) \n\t"
            "movss     %%xmm1, 32(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 16(%1) \n\t"
            "movss     %%xmm3, 48(%1) \n\t"
            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
            "movss     %%xmm0,  8(%1) \n\t"
            "movss     %%xmm1, 40(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 24(%1) \n\t"
            "movss     %%xmm3, 56(%1) \n\t"
            "sub  $64, %1 \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movaps %%xmm0,   (%1,%0) \n\t"
            "movaps %%xmm1, 16(%1,%0) \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}
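/* Scalar sketch of the operation, covering the general case that falls back
 * to ff_vector_fmul_add_add_c (illustrative only; the SIMD paths above
 * handle step 1 and 2 with src3 == 0): */
#if 0
static void vector_fmul_add_add_ref(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}
#endif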

static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "pf2id       %1, %%mm0 \n\t"
            "pf2id       %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("femms");
}
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "cvtps2pi    %1, %%mm0 \n\t"
            "cvtps2pi    %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("emms");
}
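/* Scalar sketch of the SSE version (illustrative only; assumes lrintf from
 * <math.h>). cvtps2pi rounds to nearest under the default MXCSR mode and
 * packssdw supplies the saturation; pf2id truncates instead, hence the
 * "not bit-exact" note above: */
#if 0
static void float_to_int16_ref(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++){
        int v = lrintf(src[i]);
        if(v >  32767) v =  32767;
        if(v < -32768) v = -32768;
        dst[i] = v;
    }
}
#endif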

extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);

void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    mm_flags = mm_support();

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & FF_MM_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }

#if 0
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)