Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / dsputil_mmx.c @ 9b5dc867

History | View | Annotate | Download (140 KB)

1
/*
2
 * MMX optimized DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 *
22
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23
 */
24

    
25
#include "../dsputil.h"
26
#include "../simple_idct.h"
27
#include "../mpegvideo.h"
28
#include "x86_cpu.h"
29
#include "mmx.h"
30

    
31
//#undef NDEBUG
32
//#include <assert.h>
33

    
34
extern void ff_idct_xvid_mmx(short *block);
35
extern void ff_idct_xvid_mmx2(short *block);
36

    
37
int mm_flags; /* multimedia extension flags */
38

    
39
/* pixel operations */
40
static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
41
static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
42
static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
43

    
44
static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
45
{0x8000000080000000ULL, 0x8000000080000000ULL};
46

    
47
static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
48
static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
49
static const uint64_t ff_pw_4  attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
50
static const uint64_t ff_pw_5  attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
51
static const uint64_t ff_pw_8  attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
52
static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
53
static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
54
static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
55
static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
56

    
57
static const uint64_t ff_pb_1  attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
58
static const uint64_t ff_pb_3  attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
59
static const uint64_t ff_pb_7  attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
60
static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
61
static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
62
static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
63
static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
64

    
65
#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
66
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
67

    
68
#define MOVQ_WONE(regd) \
69
    __asm __volatile ( \
70
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
71
    "psrlw $15, %%" #regd ::)
72

    
73
#define MOVQ_BFE(regd) \
74
    __asm __volatile ( \
75
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
76
    "paddb %%" #regd ", %%" #regd " \n\t" ::)
77

    
78
#ifndef PIC
79
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
80
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
81
#else
82
// for shared library it's better to use this way for accessing constants
83
// pcmpeqd -> -1
84
#define MOVQ_BONE(regd) \
85
    __asm __volatile ( \
86
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
87
    "psrlw $15, %%" #regd " \n\t" \
88
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)
89

    
90
#define MOVQ_WTWO(regd) \
91
    __asm __volatile ( \
92
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
93
    "psrlw $15, %%" #regd " \n\t" \
94
    "psllw $1, %%" #regd " \n\t"::)
95

    
96
#endif
97

    
98
// using regr as temporary and for the output result
99
// first argument is unmodifed and second is trashed
100
// regfe is supposed to contain 0xfefefefefefefefe
101
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
102
    "movq " #rega ", " #regr "  \n\t"\
103
    "pand " #regb ", " #regr "  \n\t"\
104
    "pxor " #rega ", " #regb "  \n\t"\
105
    "pand " #regfe "," #regb "  \n\t"\
106
    "psrlq $1, " #regb "        \n\t"\
107
    "paddb " #regb ", " #regr " \n\t"
108

    
109
#define PAVGB_MMX(rega, regb, regr, regfe) \
110
    "movq " #rega ", " #regr "  \n\t"\
111
    "por  " #regb ", " #regr "  \n\t"\
112
    "pxor " #rega ", " #regb "  \n\t"\
113
    "pand " #regfe "," #regb "  \n\t"\
114
    "psrlq $1, " #regb "        \n\t"\
115
    "psubb " #regb ", " #regr " \n\t"
116

    
117
// mm6 is supposed to contain 0xfefefefefefefefe
118
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
119
    "movq " #rega ", " #regr "  \n\t"\
120
    "movq " #regc ", " #regp "  \n\t"\
121
    "pand " #regb ", " #regr "  \n\t"\
122
    "pand " #regd ", " #regp "  \n\t"\
123
    "pxor " #rega ", " #regb "  \n\t"\
124
    "pxor " #regc ", " #regd "  \n\t"\
125
    "pand %%mm6, " #regb "      \n\t"\
126
    "pand %%mm6, " #regd "      \n\t"\
127
    "psrlq $1, " #regb "        \n\t"\
128
    "psrlq $1, " #regd "        \n\t"\
129
    "paddb " #regb ", " #regr " \n\t"\
130
    "paddb " #regd ", " #regp " \n\t"
131

    
132
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
133
    "movq " #rega ", " #regr "  \n\t"\
134
    "movq " #regc ", " #regp "  \n\t"\
135
    "por  " #regb ", " #regr "  \n\t"\
136
    "por  " #regd ", " #regp "  \n\t"\
137
    "pxor " #rega ", " #regb "  \n\t"\
138
    "pxor " #regc ", " #regd "  \n\t"\
139
    "pand %%mm6, " #regb "      \n\t"\
140
    "pand %%mm6, " #regd "      \n\t"\
141
    "psrlq $1, " #regd "        \n\t"\
142
    "psrlq $1, " #regb "        \n\t"\
143
    "psubb " #regb ", " #regr " \n\t"\
144
    "psubb " #regd ", " #regp " \n\t"
145

    
146
/***********************************/
147
/* MMX no rounding */
148
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
149
#define SET_RND  MOVQ_WONE
150
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
151
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
152

    
153
#include "dsputil_mmx_rnd.h"
154

    
155
#undef DEF
156
#undef SET_RND
157
#undef PAVGBP
158
#undef PAVGB
159
/***********************************/
160
/* MMX rounding */
161

    
162
#define DEF(x, y) x ## _ ## y ##_mmx
163
#define SET_RND  MOVQ_WTWO
164
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
165
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
166

    
167
#include "dsputil_mmx_rnd.h"
168

    
169
#undef DEF
170
#undef SET_RND
171
#undef PAVGBP
172
#undef PAVGB
173

    
174
/***********************************/
175
/* 3Dnow specific */
176

    
177
#define DEF(x) x ## _3dnow
178
/* for Athlons PAVGUSB is preferred */
179
#define PAVGB "pavgusb"
180

    
181
#include "dsputil_mmx_avg.h"
182

    
183
#undef DEF
184
#undef PAVGB
185

    
186
/***********************************/
187
/* MMX2 specific */
188

    
189
#define DEF(x) x ## _mmx2
190

    
191
/* Introduced only in MMX2 set */
192
#define PAVGB "pavgb"
193

    
194
#include "dsputil_mmx_avg.h"
195

    
196
#undef DEF
197
#undef PAVGB
198

    
199
#define SBUTTERFLY(a,b,t,n,m)\
200
    "mov" #m " " #a ", " #t "         \n\t" /* abcd */\
201
    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
202
    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\
203

    
204
#define TRANSPOSE4(a,b,c,d,t)\
205
    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
206
    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
207
    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
208
    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
209

    
210
/***********************************/
211
/* standard MMX */
212

    
213
#ifdef CONFIG_ENCODERS
214
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
215
{
216
    asm volatile(
217
        "mov $-128, %%"REG_a"           \n\t"
218
        "pxor %%mm7, %%mm7              \n\t"
219
        ASMALIGN(4)
220
        "1:                             \n\t"
221
        "movq (%0), %%mm0               \n\t"
222
        "movq (%0, %2), %%mm2           \n\t"
223
        "movq %%mm0, %%mm1              \n\t"
224
        "movq %%mm2, %%mm3              \n\t"
225
        "punpcklbw %%mm7, %%mm0         \n\t"
226
        "punpckhbw %%mm7, %%mm1         \n\t"
227
        "punpcklbw %%mm7, %%mm2         \n\t"
228
        "punpckhbw %%mm7, %%mm3         \n\t"
229
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
230
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
231
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
232
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
233
        "add %3, %0                     \n\t"
234
        "add $32, %%"REG_a"             \n\t"
235
        "js 1b                          \n\t"
236
        : "+r" (pixels)
237
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
238
        : "%"REG_a
239
    );
240
}
241

    
242
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
243
{
244
    asm volatile(
245
        "pxor %%mm7, %%mm7              \n\t"
246
        "mov $-128, %%"REG_a"           \n\t"
247
        ASMALIGN(4)
248
        "1:                             \n\t"
249
        "movq (%0), %%mm0               \n\t"
250
        "movq (%1), %%mm2               \n\t"
251
        "movq %%mm0, %%mm1              \n\t"
252
        "movq %%mm2, %%mm3              \n\t"
253
        "punpcklbw %%mm7, %%mm0         \n\t"
254
        "punpckhbw %%mm7, %%mm1         \n\t"
255
        "punpcklbw %%mm7, %%mm2         \n\t"
256
        "punpckhbw %%mm7, %%mm3         \n\t"
257
        "psubw %%mm2, %%mm0             \n\t"
258
        "psubw %%mm3, %%mm1             \n\t"
259
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
260
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
261
        "add %3, %0                     \n\t"
262
        "add %3, %1                     \n\t"
263
        "add $16, %%"REG_a"             \n\t"
264
        "jnz 1b                         \n\t"
265
        : "+r" (s1), "+r" (s2)
266
        : "r" (block+64), "r" ((long)stride)
267
        : "%"REG_a
268
    );
269
}
270
#endif //CONFIG_ENCODERS
271

    
272
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
273
{
274
    const DCTELEM *p;
275
    uint8_t *pix;
276

    
277
    /* read the pixels */
278
    p = block;
279
    pix = pixels;
280
    /* unrolled loop */
281
        __asm __volatile(
282
                "movq   %3, %%mm0               \n\t"
283
                "movq   8%3, %%mm1              \n\t"
284
                "movq   16%3, %%mm2             \n\t"
285
                "movq   24%3, %%mm3             \n\t"
286
                "movq   32%3, %%mm4             \n\t"
287
                "movq   40%3, %%mm5             \n\t"
288
                "movq   48%3, %%mm6             \n\t"
289
                "movq   56%3, %%mm7             \n\t"
290
                "packuswb %%mm1, %%mm0          \n\t"
291
                "packuswb %%mm3, %%mm2          \n\t"
292
                "packuswb %%mm5, %%mm4          \n\t"
293
                "packuswb %%mm7, %%mm6          \n\t"
294
                "movq   %%mm0, (%0)             \n\t"
295
                "movq   %%mm2, (%0, %1)         \n\t"
296
                "movq   %%mm4, (%0, %1, 2)      \n\t"
297
                "movq   %%mm6, (%0, %2)         \n\t"
298
                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
299
                :"memory");
300
        pix += line_size*4;
301
        p += 32;
302

    
303
    // if here would be an exact copy of the code above
304
    // compiler would generate some very strange code
305
    // thus using "r"
306
    __asm __volatile(
307
            "movq       (%3), %%mm0             \n\t"
308
            "movq       8(%3), %%mm1            \n\t"
309
            "movq       16(%3), %%mm2           \n\t"
310
            "movq       24(%3), %%mm3           \n\t"
311
            "movq       32(%3), %%mm4           \n\t"
312
            "movq       40(%3), %%mm5           \n\t"
313
            "movq       48(%3), %%mm6           \n\t"
314
            "movq       56(%3), %%mm7           \n\t"
315
            "packuswb %%mm1, %%mm0              \n\t"
316
            "packuswb %%mm3, %%mm2              \n\t"
317
            "packuswb %%mm5, %%mm4              \n\t"
318
            "packuswb %%mm7, %%mm6              \n\t"
319
            "movq       %%mm0, (%0)             \n\t"
320
            "movq       %%mm2, (%0, %1)         \n\t"
321
            "movq       %%mm4, (%0, %1, 2)      \n\t"
322
            "movq       %%mm6, (%0, %2)         \n\t"
323
            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
324
            :"memory");
325
}
326

    
327
static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
328
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
329

    
330
void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
331
{
332
    int i;
333

    
334
    movq_m2r(*vector128, mm1);
335
    for (i = 0; i < 8; i++) {
336
        movq_m2r(*(block), mm0);
337
        packsswb_m2r(*(block + 4), mm0);
338
        block += 8;
339
        paddb_r2r(mm1, mm0);
340
        movq_r2m(mm0, *pixels);
341
        pixels += line_size;
342
    }
343
}
344

    
345
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
346
{
347
    const DCTELEM *p;
348
    uint8_t *pix;
349
    int i;
350

    
351
    /* read the pixels */
352
    p = block;
353
    pix = pixels;
354
    MOVQ_ZERO(mm7);
355
    i = 4;
356
    do {
357
        __asm __volatile(
358
                "movq   (%2), %%mm0     \n\t"
359
                "movq   8(%2), %%mm1    \n\t"
360
                "movq   16(%2), %%mm2   \n\t"
361
                "movq   24(%2), %%mm3   \n\t"
362
                "movq   %0, %%mm4       \n\t"
363
                "movq   %1, %%mm6       \n\t"
364
                "movq   %%mm4, %%mm5    \n\t"
365
                "punpcklbw %%mm7, %%mm4 \n\t"
366
                "punpckhbw %%mm7, %%mm5 \n\t"
367
                "paddsw %%mm4, %%mm0    \n\t"
368
                "paddsw %%mm5, %%mm1    \n\t"
369
                "movq   %%mm6, %%mm5    \n\t"
370
                "punpcklbw %%mm7, %%mm6 \n\t"
371
                "punpckhbw %%mm7, %%mm5 \n\t"
372
                "paddsw %%mm6, %%mm2    \n\t"
373
                "paddsw %%mm5, %%mm3    \n\t"
374
                "packuswb %%mm1, %%mm0  \n\t"
375
                "packuswb %%mm3, %%mm2  \n\t"
376
                "movq   %%mm0, %0       \n\t"
377
                "movq   %%mm2, %1       \n\t"
378
                :"+m"(*pix), "+m"(*(pix+line_size))
379
                :"r"(p)
380
                :"memory");
381
        pix += line_size*2;
382
        p += 16;
383
    } while (--i);
384
}
385

    
386
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
387
{
388
    __asm __volatile(
389
         "lea (%3, %3), %%"REG_a"       \n\t"
390
         ASMALIGN(3)
391
         "1:                            \n\t"
392
         "movd (%1), %%mm0              \n\t"
393
         "movd (%1, %3), %%mm1          \n\t"
394
         "movd %%mm0, (%2)              \n\t"
395
         "movd %%mm1, (%2, %3)          \n\t"
396
         "add %%"REG_a", %1             \n\t"
397
         "add %%"REG_a", %2             \n\t"
398
         "movd (%1), %%mm0              \n\t"
399
         "movd (%1, %3), %%mm1          \n\t"
400
         "movd %%mm0, (%2)              \n\t"
401
         "movd %%mm1, (%2, %3)          \n\t"
402
         "add %%"REG_a", %1             \n\t"
403
         "add %%"REG_a", %2             \n\t"
404
         "subl $4, %0                   \n\t"
405
         "jnz 1b                        \n\t"
406
         : "+g"(h), "+r" (pixels),  "+r" (block)
407
         : "r"((long)line_size)
408
         : "%"REG_a, "memory"
409
        );
410
}
411

    
412
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
413
{
414
    __asm __volatile(
415
         "lea (%3, %3), %%"REG_a"       \n\t"
416
         ASMALIGN(3)
417
         "1:                            \n\t"
418
         "movq (%1), %%mm0              \n\t"
419
         "movq (%1, %3), %%mm1          \n\t"
420
         "movq %%mm0, (%2)              \n\t"
421
         "movq %%mm1, (%2, %3)          \n\t"
422
         "add %%"REG_a", %1             \n\t"
423
         "add %%"REG_a", %2             \n\t"
424
         "movq (%1), %%mm0              \n\t"
425
         "movq (%1, %3), %%mm1          \n\t"
426
         "movq %%mm0, (%2)              \n\t"
427
         "movq %%mm1, (%2, %3)          \n\t"
428
         "add %%"REG_a", %1             \n\t"
429
         "add %%"REG_a", %2             \n\t"
430
         "subl $4, %0                   \n\t"
431
         "jnz 1b                        \n\t"
432
         : "+g"(h), "+r" (pixels),  "+r" (block)
433
         : "r"((long)line_size)
434
         : "%"REG_a, "memory"
435
        );
436
}
437

    
438
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
439
{
440
    __asm __volatile(
441
         "lea (%3, %3), %%"REG_a"       \n\t"
442
         ASMALIGN(3)
443
         "1:                            \n\t"
444
         "movq (%1), %%mm0              \n\t"
445
         "movq 8(%1), %%mm4             \n\t"
446
         "movq (%1, %3), %%mm1          \n\t"
447
         "movq 8(%1, %3), %%mm5         \n\t"
448
         "movq %%mm0, (%2)              \n\t"
449
         "movq %%mm4, 8(%2)             \n\t"
450
         "movq %%mm1, (%2, %3)          \n\t"
451
         "movq %%mm5, 8(%2, %3)         \n\t"
452
         "add %%"REG_a", %1             \n\t"
453
         "add %%"REG_a", %2             \n\t"
454
         "movq (%1), %%mm0              \n\t"
455
         "movq 8(%1), %%mm4             \n\t"
456
         "movq (%1, %3), %%mm1          \n\t"
457
         "movq 8(%1, %3), %%mm5         \n\t"
458
         "movq %%mm0, (%2)              \n\t"
459
         "movq %%mm4, 8(%2)             \n\t"
460
         "movq %%mm1, (%2, %3)          \n\t"
461
         "movq %%mm5, 8(%2, %3)         \n\t"
462
         "add %%"REG_a", %1             \n\t"
463
         "add %%"REG_a", %2             \n\t"
464
         "subl $4, %0                   \n\t"
465
         "jnz 1b                        \n\t"
466
         : "+g"(h), "+r" (pixels),  "+r" (block)
467
         : "r"((long)line_size)
468
         : "%"REG_a, "memory"
469
        );
470
}
471

    
472
static void clear_blocks_mmx(DCTELEM *blocks)
473
{
474
    __asm __volatile(
475
                "pxor %%mm7, %%mm7              \n\t"
476
                "mov $-128*6, %%"REG_a"         \n\t"
477
                "1:                             \n\t"
478
                "movq %%mm7, (%0, %%"REG_a")    \n\t"
479
                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
480
                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
481
                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
482
                "add $32, %%"REG_a"             \n\t"
483
                " js 1b                         \n\t"
484
                : : "r" (((uint8_t *)blocks)+128*6)
485
                : "%"REG_a
486
        );
487
}
488

    
489
#ifdef CONFIG_ENCODERS
490
static int pix_sum16_mmx(uint8_t * pix, int line_size){
491
    const int h=16;
492
    int sum;
493
    long index= -line_size*h;
494

    
495
    __asm __volatile(
496
                "pxor %%mm7, %%mm7              \n\t"
497
                "pxor %%mm6, %%mm6              \n\t"
498
                "1:                             \n\t"
499
                "movq (%2, %1), %%mm0           \n\t"
500
                "movq (%2, %1), %%mm1           \n\t"
501
                "movq 8(%2, %1), %%mm2          \n\t"
502
                "movq 8(%2, %1), %%mm3          \n\t"
503
                "punpcklbw %%mm7, %%mm0         \n\t"
504
                "punpckhbw %%mm7, %%mm1         \n\t"
505
                "punpcklbw %%mm7, %%mm2         \n\t"
506
                "punpckhbw %%mm7, %%mm3         \n\t"
507
                "paddw %%mm0, %%mm1             \n\t"
508
                "paddw %%mm2, %%mm3             \n\t"
509
                "paddw %%mm1, %%mm3             \n\t"
510
                "paddw %%mm3, %%mm6             \n\t"
511
                "add %3, %1                     \n\t"
512
                " js 1b                         \n\t"
513
                "movq %%mm6, %%mm5              \n\t"
514
                "psrlq $32, %%mm6               \n\t"
515
                "paddw %%mm5, %%mm6             \n\t"
516
                "movq %%mm6, %%mm5              \n\t"
517
                "psrlq $16, %%mm6               \n\t"
518
                "paddw %%mm5, %%mm6             \n\t"
519
                "movd %%mm6, %0                 \n\t"
520
                "andl $0xFFFF, %0               \n\t"
521
                : "=&r" (sum), "+r" (index)
522
                : "r" (pix - index), "r" ((long)line_size)
523
        );
524

    
525
        return sum;
526
}
527
#endif //CONFIG_ENCODERS
528

    
529
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
530
    long i=0;
531
    asm volatile(
532
        "1:                             \n\t"
533
        "movq  (%1, %0), %%mm0          \n\t"
534
        "movq  (%2, %0), %%mm1          \n\t"
535
        "paddb %%mm0, %%mm1             \n\t"
536
        "movq %%mm1, (%2, %0)           \n\t"
537
        "movq 8(%1, %0), %%mm0          \n\t"
538
        "movq 8(%2, %0), %%mm1          \n\t"
539
        "paddb %%mm0, %%mm1             \n\t"
540
        "movq %%mm1, 8(%2, %0)          \n\t"
541
        "add $16, %0                    \n\t"
542
        "cmp %3, %0                     \n\t"
543
        " jb 1b                         \n\t"
544
        : "+r" (i)
545
        : "r"(src), "r"(dst), "r"((long)w-15)
546
    );
547
    for(; i<w; i++)
548
        dst[i+0] += src[i+0];
549
}
550

    
551
#define H263_LOOP_FILTER \
552
        "pxor %%mm7, %%mm7              \n\t"\
553
        "movq  %0, %%mm0                \n\t"\
554
        "movq  %0, %%mm1                \n\t"\
555
        "movq  %3, %%mm2                \n\t"\
556
        "movq  %3, %%mm3                \n\t"\
557
        "punpcklbw %%mm7, %%mm0         \n\t"\
558
        "punpckhbw %%mm7, %%mm1         \n\t"\
559
        "punpcklbw %%mm7, %%mm2         \n\t"\
560
        "punpckhbw %%mm7, %%mm3         \n\t"\
561
        "psubw %%mm2, %%mm0             \n\t"\
562
        "psubw %%mm3, %%mm1             \n\t"\
563
        "movq  %1, %%mm2                \n\t"\
564
        "movq  %1, %%mm3                \n\t"\
565
        "movq  %2, %%mm4                \n\t"\
566
        "movq  %2, %%mm5                \n\t"\
567
        "punpcklbw %%mm7, %%mm2         \n\t"\
568
        "punpckhbw %%mm7, %%mm3         \n\t"\
569
        "punpcklbw %%mm7, %%mm4         \n\t"\
570
        "punpckhbw %%mm7, %%mm5         \n\t"\
571
        "psubw %%mm2, %%mm4             \n\t"\
572
        "psubw %%mm3, %%mm5             \n\t"\
573
        "psllw $2, %%mm4                \n\t"\
574
        "psllw $2, %%mm5                \n\t"\
575
        "paddw %%mm0, %%mm4             \n\t"\
576
        "paddw %%mm1, %%mm5             \n\t"\
577
        "pxor %%mm6, %%mm6              \n\t"\
578
        "pcmpgtw %%mm4, %%mm6           \n\t"\
579
        "pcmpgtw %%mm5, %%mm7           \n\t"\
580
        "pxor %%mm6, %%mm4              \n\t"\
581
        "pxor %%mm7, %%mm5              \n\t"\
582
        "psubw %%mm6, %%mm4             \n\t"\
583
        "psubw %%mm7, %%mm5             \n\t"\
584
        "psrlw $3, %%mm4                \n\t"\
585
        "psrlw $3, %%mm5                \n\t"\
586
        "packuswb %%mm5, %%mm4          \n\t"\
587
        "packsswb %%mm7, %%mm6          \n\t"\
588
        "pxor %%mm7, %%mm7              \n\t"\
589
        "movd %4, %%mm2                 \n\t"\
590
        "punpcklbw %%mm2, %%mm2         \n\t"\
591
        "punpcklbw %%mm2, %%mm2         \n\t"\
592
        "punpcklbw %%mm2, %%mm2         \n\t"\
593
        "psubusb %%mm4, %%mm2           \n\t"\
594
        "movq %%mm2, %%mm3              \n\t"\
595
        "psubusb %%mm4, %%mm3           \n\t"\
596
        "psubb %%mm3, %%mm2             \n\t"\
597
        "movq %1, %%mm3                 \n\t"\
598
        "movq %2, %%mm4                 \n\t"\
599
        "pxor %%mm6, %%mm3              \n\t"\
600
        "pxor %%mm6, %%mm4              \n\t"\
601
        "paddusb %%mm2, %%mm3           \n\t"\
602
        "psubusb %%mm2, %%mm4           \n\t"\
603
        "pxor %%mm6, %%mm3              \n\t"\
604
        "pxor %%mm6, %%mm4              \n\t"\
605
        "paddusb %%mm2, %%mm2           \n\t"\
606
        "packsswb %%mm1, %%mm0          \n\t"\
607
        "pcmpgtb %%mm0, %%mm7           \n\t"\
608
        "pxor %%mm7, %%mm0              \n\t"\
609
        "psubb %%mm7, %%mm0             \n\t"\
610
        "movq %%mm0, %%mm1              \n\t"\
611
        "psubusb %%mm2, %%mm0           \n\t"\
612
        "psubb %%mm0, %%mm1             \n\t"\
613
        "pand %5, %%mm1                 \n\t"\
614
        "psrlw $2, %%mm1                \n\t"\
615
        "pxor %%mm7, %%mm1              \n\t"\
616
        "psubb %%mm7, %%mm1             \n\t"\
617
        "movq %0, %%mm5                 \n\t"\
618
        "movq %3, %%mm6                 \n\t"\
619
        "psubb %%mm1, %%mm5             \n\t"\
620
        "paddb %%mm1, %%mm6             \n\t"
621

    
622
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
623
    const int strength= ff_h263_loop_filter_strength[qscale];
624

    
625
    asm volatile(
626

    
627
        H263_LOOP_FILTER
628

    
629
        "movq %%mm3, %1                 \n\t"
630
        "movq %%mm4, %2                 \n\t"
631
        "movq %%mm5, %0                 \n\t"
632
        "movq %%mm6, %3                 \n\t"
633
        : "+m" (*(uint64_t*)(src - 2*stride)),
634
          "+m" (*(uint64_t*)(src - 1*stride)),
635
          "+m" (*(uint64_t*)(src + 0*stride)),
636
          "+m" (*(uint64_t*)(src + 1*stride))
637
        : "g" (2*strength), "m"(ff_pb_FC)
638
    );
639
}
640

    
641
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
642
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
643
        "movd  %4, %%mm0                \n\t"
644
        "movd  %5, %%mm1                \n\t"
645
        "movd  %6, %%mm2                \n\t"
646
        "movd  %7, %%mm3                \n\t"
647
        "punpcklbw %%mm1, %%mm0         \n\t"
648
        "punpcklbw %%mm3, %%mm2         \n\t"
649
        "movq %%mm0, %%mm1              \n\t"
650
        "punpcklwd %%mm2, %%mm0         \n\t"
651
        "punpckhwd %%mm2, %%mm1         \n\t"
652
        "movd  %%mm0, %0                \n\t"
653
        "punpckhdq %%mm0, %%mm0         \n\t"
654
        "movd  %%mm0, %1                \n\t"
655
        "movd  %%mm1, %2                \n\t"
656
        "punpckhdq %%mm1, %%mm1         \n\t"
657
        "movd  %%mm1, %3                \n\t"
658

    
659
        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
660
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
661
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
662
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
663
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
664
           "m" (*(uint32_t*)(src + 1*src_stride)),
665
           "m" (*(uint32_t*)(src + 2*src_stride)),
666
           "m" (*(uint32_t*)(src + 3*src_stride))
667
    );
668
}
669

    
670
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
671
    const int strength= ff_h263_loop_filter_strength[qscale];
672
    uint64_t temp[4] __attribute__ ((aligned(8)));
673
    uint8_t *btemp= (uint8_t*)temp;
674

    
675
    src -= 2;
676

    
677
    transpose4x4(btemp  , src           , 8, stride);
678
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
679
    asm volatile(
680
        H263_LOOP_FILTER // 5 3 4 6
681

    
682
        : "+m" (temp[0]),
683
          "+m" (temp[1]),
684
          "+m" (temp[2]),
685
          "+m" (temp[3])
686
        : "g" (2*strength), "m"(ff_pb_FC)
687
    );
688

    
689
    asm volatile(
690
        "movq %%mm5, %%mm1              \n\t"
691
        "movq %%mm4, %%mm0              \n\t"
692
        "punpcklbw %%mm3, %%mm5         \n\t"
693
        "punpcklbw %%mm6, %%mm4         \n\t"
694
        "punpckhbw %%mm3, %%mm1         \n\t"
695
        "punpckhbw %%mm6, %%mm0         \n\t"
696
        "movq %%mm5, %%mm3              \n\t"
697
        "movq %%mm1, %%mm6              \n\t"
698
        "punpcklwd %%mm4, %%mm5         \n\t"
699
        "punpcklwd %%mm0, %%mm1         \n\t"
700
        "punpckhwd %%mm4, %%mm3         \n\t"
701
        "punpckhwd %%mm0, %%mm6         \n\t"
702
        "movd %%mm5, (%0)               \n\t"
703
        "punpckhdq %%mm5, %%mm5         \n\t"
704
        "movd %%mm5, (%0,%2)            \n\t"
705
        "movd %%mm3, (%0,%2,2)          \n\t"
706
        "punpckhdq %%mm3, %%mm3         \n\t"
707
        "movd %%mm3, (%0,%3)            \n\t"
708
        "movd %%mm1, (%1)               \n\t"
709
        "punpckhdq %%mm1, %%mm1         \n\t"
710
        "movd %%mm1, (%1,%2)            \n\t"
711
        "movd %%mm6, (%1,%2,2)          \n\t"
712
        "punpckhdq %%mm6, %%mm6         \n\t"
713
        "movd %%mm6, (%1,%3)            \n\t"
714
        :: "r" (src),
715
           "r" (src + 4*stride),
716
           "r" ((long)   stride ),
717
           "r" ((long)(3*stride))
718
    );
719
}
720

    
721
#ifdef CONFIG_ENCODERS
722
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
723
    int tmp;
724
  asm volatile (
725
      "movl $16,%%ecx\n"
726
      "pxor %%mm0,%%mm0\n"
727
      "pxor %%mm7,%%mm7\n"
728
      "1:\n"
729
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
730
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */
731

    
732
      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */
733

    
734
      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
735
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
736

    
737
      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
738
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
739
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
740

    
741
      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
742
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
743

    
744
      "pmaddwd %%mm3,%%mm3\n"
745
      "pmaddwd %%mm4,%%mm4\n"
746

    
747
      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
748
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
749
      "paddd %%mm3,%%mm4\n"
750
      "paddd %%mm2,%%mm7\n"
751

    
752
      "add %2, %0\n"
753
      "paddd %%mm4,%%mm7\n"
754
      "dec %%ecx\n"
755
      "jnz 1b\n"
756

    
757
      "movq %%mm7,%%mm1\n"
758
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
759
      "paddd %%mm7,%%mm1\n"
760
      "movd %%mm1,%1\n"
761
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
762
    return tmp;
763
}
764

    
765
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
766
    int tmp;
767
  asm volatile (
768
      "movl %4,%%ecx\n"
769
      "shr $1,%%ecx\n"
770
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
771
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
772
      "1:\n"
773
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
774
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
775
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
776
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */
777

    
778
      /* todo: mm1-mm2, mm3-mm4 */
779
      /* algo: substract mm1 from mm2 with saturation and vice versa */
780
      /*       OR the results to get absolute difference */
781
      "movq %%mm1,%%mm5\n"
782
      "movq %%mm3,%%mm6\n"
783
      "psubusb %%mm2,%%mm1\n"
784
      "psubusb %%mm4,%%mm3\n"
785
      "psubusb %%mm5,%%mm2\n"
786
      "psubusb %%mm6,%%mm4\n"
787

    
788
      "por %%mm1,%%mm2\n"
789
      "por %%mm3,%%mm4\n"
790

    
791
      /* now convert to 16-bit vectors so we can square them */
792
      "movq %%mm2,%%mm1\n"
793
      "movq %%mm4,%%mm3\n"
794

    
795
      "punpckhbw %%mm0,%%mm2\n"
796
      "punpckhbw %%mm0,%%mm4\n"
797
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
798
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
799

    
800
      "pmaddwd %%mm2,%%mm2\n"
801
      "pmaddwd %%mm4,%%mm4\n"
802
      "pmaddwd %%mm1,%%mm1\n"
803
      "pmaddwd %%mm3,%%mm3\n"
804

    
805
      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
806
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */
807

    
808
      "paddd %%mm2,%%mm1\n"
809
      "paddd %%mm4,%%mm3\n"
810
      "paddd %%mm1,%%mm7\n"
811
      "paddd %%mm3,%%mm7\n"
812

    
813
      "decl %%ecx\n"
814
      "jnz 1b\n"
815

    
816
      "movq %%mm7,%%mm1\n"
817
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
818
      "paddd %%mm7,%%mm1\n"
819
      "movd %%mm1,%2\n"
820
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
821
      : "r" ((long)line_size) , "m" (h)
822
      : "%ecx");
823
    return tmp;
824
}
825

    
826
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
827
    int tmp;
828
  asm volatile (
829
      "movl %4,%%ecx\n"
830
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
831
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
832
      "1:\n"
833
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
834
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
835
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
836
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */
837

    
838
      /* todo: mm1-mm2, mm3-mm4 */
839
      /* algo: substract mm1 from mm2 with saturation and vice versa */
840
      /*       OR the results to get absolute difference */
841
      "movq %%mm1,%%mm5\n"
842
      "movq %%mm3,%%mm6\n"
843
      "psubusb %%mm2,%%mm1\n"
844
      "psubusb %%mm4,%%mm3\n"
845
      "psubusb %%mm5,%%mm2\n"
846
      "psubusb %%mm6,%%mm4\n"
847

    
848
      "por %%mm1,%%mm2\n"
849
      "por %%mm3,%%mm4\n"
850

    
851
      /* now convert to 16-bit vectors so we can square them */
852
      "movq %%mm2,%%mm1\n"
853
      "movq %%mm4,%%mm3\n"
854

    
855
      "punpckhbw %%mm0,%%mm2\n"
856
      "punpckhbw %%mm0,%%mm4\n"
857
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
858
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
859

    
860
      "pmaddwd %%mm2,%%mm2\n"
861
      "pmaddwd %%mm4,%%mm4\n"
862
      "pmaddwd %%mm1,%%mm1\n"
863
      "pmaddwd %%mm3,%%mm3\n"
864

    
865
      "add %3,%0\n"
866
      "add %3,%1\n"
867

    
868
      "paddd %%mm2,%%mm1\n"
869
      "paddd %%mm4,%%mm3\n"
870
      "paddd %%mm1,%%mm7\n"
871
      "paddd %%mm3,%%mm7\n"
872

    
873
      "decl %%ecx\n"
874
      "jnz 1b\n"
875

    
876
      "movq %%mm7,%%mm1\n"
877
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
878
      "paddd %%mm7,%%mm1\n"
879
      "movd %%mm1,%2\n"
880
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
881
      : "r" ((long)line_size) , "m" (h)
882
      : "%ecx");
883
    return tmp;
884
}
885

    
886
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
887
    int tmp;
888
  asm volatile (
889
      "shr $1,%2\n"
890
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
891
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
892
      "1:\n"
893
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
894
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
895
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
896
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
897

    
898
      /* todo: mm1-mm2, mm3-mm4 */
899
      /* algo: substract mm1 from mm2 with saturation and vice versa */
900
      /*       OR the results to get absolute difference */
901
      "movdqa %%xmm1,%%xmm5\n"
902
      "movdqa %%xmm3,%%xmm6\n"
903
      "psubusb %%xmm2,%%xmm1\n"
904
      "psubusb %%xmm4,%%xmm3\n"
905
      "psubusb %%xmm5,%%xmm2\n"
906
      "psubusb %%xmm6,%%xmm4\n"
907

    
908
      "por %%xmm1,%%xmm2\n"
909
      "por %%xmm3,%%xmm4\n"
910

    
911
      /* now convert to 16-bit vectors so we can square them */
912
      "movdqa %%xmm2,%%xmm1\n"
913
      "movdqa %%xmm4,%%xmm3\n"
914

    
915
      "punpckhbw %%xmm0,%%xmm2\n"
916
      "punpckhbw %%xmm0,%%xmm4\n"
917
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
918
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */
919

    
920
      "pmaddwd %%xmm2,%%xmm2\n"
921
      "pmaddwd %%xmm4,%%xmm4\n"
922
      "pmaddwd %%xmm1,%%xmm1\n"
923
      "pmaddwd %%xmm3,%%xmm3\n"
924

    
925
      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
926
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */
927

    
928
      "paddd %%xmm2,%%xmm1\n"
929
      "paddd %%xmm4,%%xmm3\n"
930
      "paddd %%xmm1,%%xmm7\n"
931
      "paddd %%xmm3,%%xmm7\n"
932

    
933
      "decl %2\n"
934
      "jnz 1b\n"
935

    
936
      "movdqa %%xmm7,%%xmm1\n"
937
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
938
      "paddd %%xmm1,%%xmm7\n"
939
      "movdqa %%xmm7,%%xmm1\n"
940
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
941
      "paddd %%xmm1,%%xmm7\n"
942
      "movd %%xmm7,%3\n"
943
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
944
      : "r" ((long)line_size));
945
    return tmp;
946
}
947

    
948
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
949
    int tmp;
950
  asm volatile (
951
      "movl %3,%%ecx\n"
952
      "pxor %%mm7,%%mm7\n"
953
      "pxor %%mm6,%%mm6\n"
954

    
955
      "movq (%0),%%mm0\n"
956
      "movq %%mm0, %%mm1\n"
957
      "psllq $8, %%mm0\n"
958
      "psrlq $8, %%mm1\n"
959
      "psrlq $8, %%mm0\n"
960
      "movq %%mm0, %%mm2\n"
961
      "movq %%mm1, %%mm3\n"
962
      "punpcklbw %%mm7,%%mm0\n"
963
      "punpcklbw %%mm7,%%mm1\n"
964
      "punpckhbw %%mm7,%%mm2\n"
965
      "punpckhbw %%mm7,%%mm3\n"
966
      "psubw %%mm1, %%mm0\n"
967
      "psubw %%mm3, %%mm2\n"
968

    
969
      "add %2,%0\n"
970

    
971
      "movq (%0),%%mm4\n"
972
      "movq %%mm4, %%mm1\n"
973
      "psllq $8, %%mm4\n"
974
      "psrlq $8, %%mm1\n"
975
      "psrlq $8, %%mm4\n"
976
      "movq %%mm4, %%mm5\n"
977
      "movq %%mm1, %%mm3\n"
978
      "punpcklbw %%mm7,%%mm4\n"
979
      "punpcklbw %%mm7,%%mm1\n"
980
      "punpckhbw %%mm7,%%mm5\n"
981
      "punpckhbw %%mm7,%%mm3\n"
982
      "psubw %%mm1, %%mm4\n"
983
      "psubw %%mm3, %%mm5\n"
984
      "psubw %%mm4, %%mm0\n"
985
      "psubw %%mm5, %%mm2\n"
986
      "pxor %%mm3, %%mm3\n"
987
      "pxor %%mm1, %%mm1\n"
988
      "pcmpgtw %%mm0, %%mm3\n\t"
989
      "pcmpgtw %%mm2, %%mm1\n\t"
990
      "pxor %%mm3, %%mm0\n"
991
      "pxor %%mm1, %%mm2\n"
992
      "psubw %%mm3, %%mm0\n"
993
      "psubw %%mm1, %%mm2\n"
994
      "paddw %%mm0, %%mm2\n"
995
      "paddw %%mm2, %%mm6\n"
996

    
997
      "add %2,%0\n"
998
      "1:\n"
999

    
1000
      "movq (%0),%%mm0\n"
1001
      "movq %%mm0, %%mm1\n"
1002
      "psllq $8, %%mm0\n"
1003
      "psrlq $8, %%mm1\n"
1004
      "psrlq $8, %%mm0\n"
1005
      "movq %%mm0, %%mm2\n"
1006
      "movq %%mm1, %%mm3\n"
1007
      "punpcklbw %%mm7,%%mm0\n"
1008
      "punpcklbw %%mm7,%%mm1\n"
1009
      "punpckhbw %%mm7,%%mm2\n"
1010
      "punpckhbw %%mm7,%%mm3\n"
1011
      "psubw %%mm1, %%mm0\n"
1012
      "psubw %%mm3, %%mm2\n"
1013
      "psubw %%mm0, %%mm4\n"
1014
      "psubw %%mm2, %%mm5\n"
1015
      "pxor %%mm3, %%mm3\n"
1016
      "pxor %%mm1, %%mm1\n"
1017
      "pcmpgtw %%mm4, %%mm3\n\t"
1018
      "pcmpgtw %%mm5, %%mm1\n\t"
1019
      "pxor %%mm3, %%mm4\n"
1020
      "pxor %%mm1, %%mm5\n"
1021
      "psubw %%mm3, %%mm4\n"
1022
      "psubw %%mm1, %%mm5\n"
1023
      "paddw %%mm4, %%mm5\n"
1024
      "paddw %%mm5, %%mm6\n"
1025

    
1026
      "add %2,%0\n"
1027

    
1028
      "movq (%0),%%mm4\n"
1029
      "movq %%mm4, %%mm1\n"
1030
      "psllq $8, %%mm4\n"
1031
      "psrlq $8, %%mm1\n"
1032
      "psrlq $8, %%mm4\n"
1033
      "movq %%mm4, %%mm5\n"
1034
      "movq %%mm1, %%mm3\n"
1035
      "punpcklbw %%mm7,%%mm4\n"
1036
      "punpcklbw %%mm7,%%mm1\n"
1037
      "punpckhbw %%mm7,%%mm5\n"
1038
      "punpckhbw %%mm7,%%mm3\n"
1039
      "psubw %%mm1, %%mm4\n"
1040
      "psubw %%mm3, %%mm5\n"
1041
      "psubw %%mm4, %%mm0\n"
1042
      "psubw %%mm5, %%mm2\n"
1043
      "pxor %%mm3, %%mm3\n"
1044
      "pxor %%mm1, %%mm1\n"
1045
      "pcmpgtw %%mm0, %%mm3\n\t"
1046
      "pcmpgtw %%mm2, %%mm1\n\t"
1047
      "pxor %%mm3, %%mm0\n"
1048
      "pxor %%mm1, %%mm2\n"
1049
      "psubw %%mm3, %%mm0\n"
1050
      "psubw %%mm1, %%mm2\n"
1051
      "paddw %%mm0, %%mm2\n"
1052
      "paddw %%mm2, %%mm6\n"
1053

    
1054
      "add %2,%0\n"
1055
      "subl $2, %%ecx\n"
1056
      " jnz 1b\n"
1057

    
1058
      "movq %%mm6, %%mm0\n"
1059
      "punpcklwd %%mm7,%%mm0\n"
1060
      "punpckhwd %%mm7,%%mm6\n"
1061
      "paddd %%mm0, %%mm6\n"
1062

    
1063
      "movq %%mm6,%%mm0\n"
1064
      "psrlq $32, %%mm6\n"
1065
      "paddd %%mm6,%%mm0\n"
1066
      "movd %%mm0,%1\n"
1067
      : "+r" (pix1), "=r"(tmp)
1068
      : "r" ((long)line_size) , "g" (h-2)
1069
      : "%ecx");
1070
      return tmp;
1071
}
1072

    
1073
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1074
    int tmp;
1075
    uint8_t * pix= pix1;
1076
  asm volatile (
1077
      "movl %3,%%ecx\n"
1078
      "pxor %%mm7,%%mm7\n"
1079
      "pxor %%mm6,%%mm6\n"
1080

    
1081
      "movq (%0),%%mm0\n"
1082
      "movq 1(%0),%%mm1\n"
1083
      "movq %%mm0, %%mm2\n"
1084
      "movq %%mm1, %%mm3\n"
1085
      "punpcklbw %%mm7,%%mm0\n"
1086
      "punpcklbw %%mm7,%%mm1\n"
1087
      "punpckhbw %%mm7,%%mm2\n"
1088
      "punpckhbw %%mm7,%%mm3\n"
1089
      "psubw %%mm1, %%mm0\n"
1090
      "psubw %%mm3, %%mm2\n"
1091

    
1092
      "add %2,%0\n"
1093

    
1094
      "movq (%0),%%mm4\n"
1095
      "movq 1(%0),%%mm1\n"
1096
      "movq %%mm4, %%mm5\n"
1097
      "movq %%mm1, %%mm3\n"
1098
      "punpcklbw %%mm7,%%mm4\n"
1099
      "punpcklbw %%mm7,%%mm1\n"
1100
      "punpckhbw %%mm7,%%mm5\n"
1101
      "punpckhbw %%mm7,%%mm3\n"
1102
      "psubw %%mm1, %%mm4\n"
1103
      "psubw %%mm3, %%mm5\n"
1104
      "psubw %%mm4, %%mm0\n"
1105
      "psubw %%mm5, %%mm2\n"
1106
      "pxor %%mm3, %%mm3\n"
1107
      "pxor %%mm1, %%mm1\n"
1108
      "pcmpgtw %%mm0, %%mm3\n\t"
1109
      "pcmpgtw %%mm2, %%mm1\n\t"
1110
      "pxor %%mm3, %%mm0\n"
1111
      "pxor %%mm1, %%mm2\n"
1112
      "psubw %%mm3, %%mm0\n"
1113
      "psubw %%mm1, %%mm2\n"
1114
      "paddw %%mm0, %%mm2\n"
1115
      "paddw %%mm2, %%mm6\n"
1116

    
1117
      "add %2,%0\n"
1118
      "1:\n"
1119

    
1120
      "movq (%0),%%mm0\n"
1121
      "movq 1(%0),%%mm1\n"
1122
      "movq %%mm0, %%mm2\n"
1123
      "movq %%mm1, %%mm3\n"
1124
      "punpcklbw %%mm7,%%mm0\n"
1125
      "punpcklbw %%mm7,%%mm1\n"
1126
      "punpckhbw %%mm7,%%mm2\n"
1127
      "punpckhbw %%mm7,%%mm3\n"
1128
      "psubw %%mm1, %%mm0\n"
1129
      "psubw %%mm3, %%mm2\n"
1130
      "psubw %%mm0, %%mm4\n"
1131
      "psubw %%mm2, %%mm5\n"
1132
      "pxor %%mm3, %%mm3\n"
1133
      "pxor %%mm1, %%mm1\n"
1134
      "pcmpgtw %%mm4, %%mm3\n\t"
1135
      "pcmpgtw %%mm5, %%mm1\n\t"
1136
      "pxor %%mm3, %%mm4\n"
1137
      "pxor %%mm1, %%mm5\n"
1138
      "psubw %%mm3, %%mm4\n"
1139
      "psubw %%mm1, %%mm5\n"
1140
      "paddw %%mm4, %%mm5\n"
1141
      "paddw %%mm5, %%mm6\n"
1142

    
1143
      "add %2,%0\n"
1144

    
1145
      "movq (%0),%%mm4\n"
1146
      "movq 1(%0),%%mm1\n"
1147
      "movq %%mm4, %%mm5\n"
1148
      "movq %%mm1, %%mm3\n"
1149
      "punpcklbw %%mm7,%%mm4\n"
1150
      "punpcklbw %%mm7,%%mm1\n"
1151
      "punpckhbw %%mm7,%%mm5\n"
1152
      "punpckhbw %%mm7,%%mm3\n"
1153
      "psubw %%mm1, %%mm4\n"
1154
      "psubw %%mm3, %%mm5\n"
1155
      "psubw %%mm4, %%mm0\n"
1156
      "psubw %%mm5, %%mm2\n"
1157
      "pxor %%mm3, %%mm3\n"
1158
      "pxor %%mm1, %%mm1\n"
1159
      "pcmpgtw %%mm0, %%mm3\n\t"
1160
      "pcmpgtw %%mm2, %%mm1\n\t"
1161
      "pxor %%mm3, %%mm0\n"
1162
      "pxor %%mm1, %%mm2\n"
1163
      "psubw %%mm3, %%mm0\n"
1164
      "psubw %%mm1, %%mm2\n"
1165
      "paddw %%mm0, %%mm2\n"
1166
      "paddw %%mm2, %%mm6\n"
1167

    
1168
      "add %2,%0\n"
1169
      "subl $2, %%ecx\n"
1170
      " jnz 1b\n"
1171

    
1172
      "movq %%mm6, %%mm0\n"
1173
      "punpcklwd %%mm7,%%mm0\n"
1174
      "punpckhwd %%mm7,%%mm6\n"
1175
      "paddd %%mm0, %%mm6\n"
1176

    
1177
      "movq %%mm6,%%mm0\n"
1178
      "psrlq $32, %%mm6\n"
1179
      "paddd %%mm6,%%mm0\n"
1180
      "movd %%mm0,%1\n"
1181
      : "+r" (pix1), "=r"(tmp)
1182
      : "r" ((long)line_size) , "g" (h-2)
1183
      : "%ecx");
1184
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
1185
}
1186

    
1187
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1188
    MpegEncContext *c = p;
1189
    int score1, score2;
1190

    
1191
    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1192
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1193
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1194

    
1195
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1196
    else  return score1 + FFABS(score2)*8;
1197
}
1198

    
1199
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1200
    MpegEncContext *c = p;
1201
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1202
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1203

    
1204
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1205
    else  return score1 + FFABS(score2)*8;
1206
}
1207

    
1208
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1209
    int tmp;
1210

    
1211
    assert( (((int)pix) & 7) == 0);
1212
    assert((line_size &7) ==0);
1213

    
1214
#define SUM(in0, in1, out0, out1) \
1215
      "movq (%0), %%mm2\n"\
1216
      "movq 8(%0), %%mm3\n"\
1217
      "add %2,%0\n"\
1218
      "movq %%mm2, " #out0 "\n"\
1219
      "movq %%mm3, " #out1 "\n"\
1220
      "psubusb " #in0 ", %%mm2\n"\
1221
      "psubusb " #in1 ", %%mm3\n"\
1222
      "psubusb " #out0 ", " #in0 "\n"\
1223
      "psubusb " #out1 ", " #in1 "\n"\
1224
      "por %%mm2, " #in0 "\n"\
1225
      "por %%mm3, " #in1 "\n"\
1226
      "movq " #in0 ", %%mm2\n"\
1227
      "movq " #in1 ", %%mm3\n"\
1228
      "punpcklbw %%mm7, " #in0 "\n"\
1229
      "punpcklbw %%mm7, " #in1 "\n"\
1230
      "punpckhbw %%mm7, %%mm2\n"\
1231
      "punpckhbw %%mm7, %%mm3\n"\
1232
      "paddw " #in1 ", " #in0 "\n"\
1233
      "paddw %%mm3, %%mm2\n"\
1234
      "paddw %%mm2, " #in0 "\n"\
1235
      "paddw " #in0 ", %%mm6\n"
1236

    
1237

    
1238
  asm volatile (
1239
      "movl %3,%%ecx\n"
1240
      "pxor %%mm6,%%mm6\n"
1241
      "pxor %%mm7,%%mm7\n"
1242
      "movq (%0),%%mm0\n"
1243
      "movq 8(%0),%%mm1\n"
1244
      "add %2,%0\n"
1245
      "subl $2, %%ecx\n"
1246
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1247
      "1:\n"
1248

    
1249
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1250

    
1251
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1252

    
1253
      "subl $2, %%ecx\n"
1254
      "jnz 1b\n"
1255

    
1256
      "movq %%mm6,%%mm0\n"
1257
      "psrlq $32, %%mm6\n"
1258
      "paddw %%mm6,%%mm0\n"
1259
      "movq %%mm0,%%mm6\n"
1260
      "psrlq $16, %%mm0\n"
1261
      "paddw %%mm6,%%mm0\n"
1262
      "movd %%mm0,%1\n"
1263
      : "+r" (pix), "=r"(tmp)
1264
      : "r" ((long)line_size) , "m" (h)
1265
      : "%ecx");
1266
    return tmp & 0xFFFF;
1267
}
1268
#undef SUM
1269

    
1270
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1271
    int tmp;
1272

    
1273
    assert( (((int)pix) & 7) == 0);
1274
    assert((line_size &7) ==0);
1275

    
1276
#define SUM(in0, in1, out0, out1) \
1277
      "movq (%0), " #out0 "\n"\
1278
      "movq 8(%0), " #out1 "\n"\
1279
      "add %2,%0\n"\
1280
      "psadbw " #out0 ", " #in0 "\n"\
1281
      "psadbw " #out1 ", " #in1 "\n"\
1282
      "paddw " #in1 ", " #in0 "\n"\
1283
      "paddw " #in0 ", %%mm6\n"
1284

    
1285
  asm volatile (
1286
      "movl %3,%%ecx\n"
1287
      "pxor %%mm6,%%mm6\n"
1288
      "pxor %%mm7,%%mm7\n"
1289
      "movq (%0),%%mm0\n"
1290
      "movq 8(%0),%%mm1\n"
1291
      "add %2,%0\n"
1292
      "subl $2, %%ecx\n"
1293
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1294
      "1:\n"
1295

    
1296
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1297

    
1298
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1299

    
1300
      "subl $2, %%ecx\n"
1301
      "jnz 1b\n"
1302

    
1303
      "movd %%mm6,%1\n"
1304
      : "+r" (pix), "=r"(tmp)
1305
      : "r" ((long)line_size) , "m" (h)
1306
      : "%ecx");
1307
    return tmp;
1308
}
1309
#undef SUM
1310

    
1311
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1312
    int tmp;
1313

    
1314
    assert( (((int)pix1) & 7) == 0);
1315
    assert( (((int)pix2) & 7) == 0);
1316
    assert((line_size &7) ==0);
1317

    
1318
#define SUM(in0, in1, out0, out1) \
1319
      "movq (%0),%%mm2\n"\
1320
      "movq (%1)," #out0 "\n"\
1321
      "movq 8(%0),%%mm3\n"\
1322
      "movq 8(%1)," #out1 "\n"\
1323
      "add %3,%0\n"\
1324
      "add %3,%1\n"\
1325
      "psubb " #out0 ", %%mm2\n"\
1326
      "psubb " #out1 ", %%mm3\n"\
1327
      "pxor %%mm7, %%mm2\n"\
1328
      "pxor %%mm7, %%mm3\n"\
1329
      "movq %%mm2, " #out0 "\n"\
1330
      "movq %%mm3, " #out1 "\n"\
1331
      "psubusb " #in0 ", %%mm2\n"\
1332
      "psubusb " #in1 ", %%mm3\n"\
1333
      "psubusb " #out0 ", " #in0 "\n"\
1334
      "psubusb " #out1 ", " #in1 "\n"\
1335
      "por %%mm2, " #in0 "\n"\
1336
      "por %%mm3, " #in1 "\n"\
1337
      "movq " #in0 ", %%mm2\n"\
1338
      "movq " #in1 ", %%mm3\n"\
1339
      "punpcklbw %%mm7, " #in0 "\n"\
1340
      "punpcklbw %%mm7, " #in1 "\n"\
1341
      "punpckhbw %%mm7, %%mm2\n"\
1342
      "punpckhbw %%mm7, %%mm3\n"\
1343
      "paddw " #in1 ", " #in0 "\n"\
1344
      "paddw %%mm3, %%mm2\n"\
1345
      "paddw %%mm2, " #in0 "\n"\
1346
      "paddw " #in0 ", %%mm6\n"
1347

    
1348

    
1349
  asm volatile (
1350
      "movl %4,%%ecx\n"
1351
      "pxor %%mm6,%%mm6\n"
1352
      "pcmpeqw %%mm7,%%mm7\n"
1353
      "psllw $15, %%mm7\n"
1354
      "packsswb %%mm7, %%mm7\n"
1355
      "movq (%0),%%mm0\n"
1356
      "movq (%1),%%mm2\n"
1357
      "movq 8(%0),%%mm1\n"
1358
      "movq 8(%1),%%mm3\n"
1359
      "add %3,%0\n"
1360
      "add %3,%1\n"
1361
      "subl $2, %%ecx\n"
1362
      "psubb %%mm2, %%mm0\n"
1363
      "psubb %%mm3, %%mm1\n"
1364
      "pxor %%mm7, %%mm0\n"
1365
      "pxor %%mm7, %%mm1\n"
1366
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1367
      "1:\n"
1368

    
1369
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1370

    
1371
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1372

    
1373
      "subl $2, %%ecx\n"
1374
      "jnz 1b\n"
1375

    
1376
      "movq %%mm6,%%mm0\n"
1377
      "psrlq $32, %%mm6\n"
1378
      "paddw %%mm6,%%mm0\n"
1379
      "movq %%mm0,%%mm6\n"
1380
      "psrlq $16, %%mm0\n"
1381
      "paddw %%mm6,%%mm0\n"
1382
      "movd %%mm0,%2\n"
1383
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1384
      : "r" ((long)line_size) , "m" (h)
1385
      : "%ecx");
1386
    return tmp & 0x7FFF;
1387
}
1388
#undef SUM
1389

    
1390
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1391
    int tmp;
1392

    
1393
    assert( (((int)pix1) & 7) == 0);
1394
    assert( (((int)pix2) & 7) == 0);
1395
    assert((line_size &7) ==0);
1396

    
1397
#define SUM(in0, in1, out0, out1) \
1398
      "movq (%0)," #out0 "\n"\
1399
      "movq (%1),%%mm2\n"\
1400
      "movq 8(%0)," #out1 "\n"\
1401
      "movq 8(%1),%%mm3\n"\
1402
      "add %3,%0\n"\
1403
      "add %3,%1\n"\
1404
      "psubb %%mm2, " #out0 "\n"\
1405
      "psubb %%mm3, " #out1 "\n"\
1406
      "pxor %%mm7, " #out0 "\n"\
1407
      "pxor %%mm7, " #out1 "\n"\
1408
      "psadbw " #out0 ", " #in0 "\n"\
1409
      "psadbw " #out1 ", " #in1 "\n"\
1410
      "paddw " #in1 ", " #in0 "\n"\
1411
      "paddw " #in0 ", %%mm6\n"
1412

    
1413
  asm volatile (
1414
      "movl %4,%%ecx\n"
1415
      "pxor %%mm6,%%mm6\n"
1416
      "pcmpeqw %%mm7,%%mm7\n"
1417
      "psllw $15, %%mm7\n"
1418
      "packsswb %%mm7, %%mm7\n"
1419
      "movq (%0),%%mm0\n"
1420
      "movq (%1),%%mm2\n"
1421
      "movq 8(%0),%%mm1\n"
1422
      "movq 8(%1),%%mm3\n"
1423
      "add %3,%0\n"
1424
      "add %3,%1\n"
1425
      "subl $2, %%ecx\n"
1426
      "psubb %%mm2, %%mm0\n"
1427
      "psubb %%mm3, %%mm1\n"
1428
      "pxor %%mm7, %%mm0\n"
1429
      "pxor %%mm7, %%mm1\n"
1430
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1431
      "1:\n"
1432

    
1433
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1434

    
1435
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1436

    
1437
      "subl $2, %%ecx\n"
1438
      "jnz 1b\n"
1439

    
1440
      "movd %%mm6,%2\n"
1441
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1442
      : "r" ((long)line_size) , "m" (h)
1443
      : "%ecx");
1444
    return tmp;
1445
}
1446
#undef SUM
1447

    
1448
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1449
    long i=0;
1450
    asm volatile(
1451
        "1:                             \n\t"
1452
        "movq  (%2, %0), %%mm0          \n\t"
1453
        "movq  (%1, %0), %%mm1          \n\t"
1454
        "psubb %%mm0, %%mm1             \n\t"
1455
        "movq %%mm1, (%3, %0)           \n\t"
1456
        "movq 8(%2, %0), %%mm0          \n\t"
1457
        "movq 8(%1, %0), %%mm1          \n\t"
1458
        "psubb %%mm0, %%mm1             \n\t"
1459
        "movq %%mm1, 8(%3, %0)          \n\t"
1460
        "add $16, %0                    \n\t"
1461
        "cmp %4, %0                     \n\t"
1462
        " jb 1b                         \n\t"
1463
        : "+r" (i)
1464
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1465
    );
1466
    for(; i<w; i++)
1467
        dst[i+0] = src1[i+0]-src2[i+0];
1468
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
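
/* Scalar sketch of the per-byte prediction the MMX2 loop above subtracts
 * (a sketch, not the reference C version): for i >= 1 it uses the in-block
 * neighbours L = src2[i-1], T = src1[i], LT = src1[i-1] and the same
 * mid_pred() expression that is applied to dst[0] above:
 *
 *     for (i = 1; i < w; i++)
 *         dst[i] = src2[i] - mid_pred(src2[i-1], src1[i],
 *                                     (src2[i-1] + src1[i] - src1[i-1]) & 0xFF);
 */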

#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "=m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
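
/* DIFF_PIXELS_1 widens one row of differences: after the punpcklbw/psubw
 * sequence every 16-bit lane of "a" holds p1[i] - p2[i].  DIFF_PIXELS_4x8 and
 * DIFF_PIXELS_8x8 therefore leave an 8-row block of pixel differences in
 * mm0-mm7 (4 pixels per row) resp. xmm0-xmm7 (8 per row); row 0 is briefly
 * spilled to "temp" so the last row can be computed with a free register. */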
1535

    
1536
#ifdef ARCH_X86_64
1537
// permutes 01234567 -> 05736421
1538
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1539
    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
1540
    SBUTTERFLY(c,d,b,wd,dqa)\
1541
    SBUTTERFLY(e,f,d,wd,dqa)\
1542
    SBUTTERFLY(g,h,f,wd,dqa)\
1543
    SBUTTERFLY(a,c,h,dq,dqa)\
1544
    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
1545
    SBUTTERFLY(e,g,b,dq,dqa)\
1546
    SBUTTERFLY(d,f,g,dq,dqa)\
1547
    SBUTTERFLY(a,e,f,qdq,dqa)\
1548
    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
1549
    SBUTTERFLY(h,b,d,qdq,dqa)\
1550
    SBUTTERFLY(c,g,b,qdq,dqa)\
1551
    "movdqa %%xmm8, "#g"              \n\t"
1552
#else
1553
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1554
    "movdqa "#h", "#t"                \n\t"\
1555
    SBUTTERFLY(a,b,h,wd,dqa)\
1556
    "movdqa "#h", 16"#t"              \n\t"\
1557
    "movdqa "#t", "#h"                \n\t"\
1558
    SBUTTERFLY(c,d,b,wd,dqa)\
1559
    SBUTTERFLY(e,f,d,wd,dqa)\
1560
    SBUTTERFLY(g,h,f,wd,dqa)\
1561
    SBUTTERFLY(a,c,h,dq,dqa)\
1562
    "movdqa "#h", "#t"                \n\t"\
1563
    "movdqa 16"#t", "#h"              \n\t"\
1564
    SBUTTERFLY(h,b,c,dq,dqa)\
1565
    SBUTTERFLY(e,g,b,dq,dqa)\
1566
    SBUTTERFLY(d,f,g,dq,dqa)\
1567
    SBUTTERFLY(a,e,f,qdq,dqa)\
1568
    SBUTTERFLY(h,d,e,qdq,dqa)\
1569
    "movdqa "#h", 16"#t"              \n\t"\
1570
    "movdqa "#t", "#h"                \n\t"\
1571
    SBUTTERFLY(h,b,d,qdq,dqa)\
1572
    SBUTTERFLY(c,g,b,qdq,dqa)\
1573
    "movdqa 16"#t", "#g"              \n\t"
1574
#endif

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
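
/* LBUTTERFLY2 performs two butterflies at once: (a, b) -> (a + b, b - a).
 * HADAMARD8 chains three such passes over eight registers, which is (up to
 * sign and ordering) an 8-point Hadamard transform of the packed words; the
 * hadamard8_diff functions below apply it along both dimensions of the
 * difference block to obtain a SATD-style cost. */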
1593

    
1594
#define MMABS_MMX(a,z)\
1595
    "pxor " #z ", " #z "              \n\t"\
1596
    "pcmpgtw " #a ", " #z "           \n\t"\
1597
    "pxor " #z ", " #a "              \n\t"\
1598
    "psubw " #z ", " #a "             \n\t"
1599

    
1600
#define MMABS_MMX2(a,z)\
1601
    "pxor " #z ", " #z "              \n\t"\
1602
    "psubw " #a ", " #z "             \n\t"\
1603
    "pmaxsw " #z ", " #a "            \n\t"
1604

    
1605
#define MMABS_SSSE3(a,z)\
1606
    "pabsw " #a ", " #a "             \n\t"
1607

    
1608
#define MMABS_SUM(a,z, sum)\
1609
    MMABS(a,z)\
1610
    "paddusw " #a ", " #sum "         \n\t"
1611

    
1612
#define MMABS_SUM_8x8_NOSPILL\
1613
    MMABS(%%xmm0, %%xmm8)\
1614
    MMABS(%%xmm1, %%xmm9)\
1615
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1616
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1617
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1618
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1619
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1620
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1621
    "paddusw %%xmm1, %%xmm0           \n\t"
1622

    
1623
#ifdef ARCH_X86_64
1624
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1625
#else
1626
#define MMABS_SUM_8x8_SSE2\
1627
    "movdqa %%xmm7, (%1)              \n\t"\
1628
    MMABS(%%xmm0, %%xmm7)\
1629
    MMABS(%%xmm1, %%xmm7)\
1630
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1631
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1632
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1633
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1634
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1635
    "movdqa (%1), %%xmm2              \n\t"\
1636
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1637
    "paddusw %%xmm1, %%xmm0           \n\t"
1638
#endif
1639

    
1640
#define LOAD4(o, a, b, c, d)\
1641
    "movq "#o"(%1),    "#a"           \n\t"\
1642
    "movq "#o"+8(%1),  "#b"           \n\t"\
1643
    "movq "#o"+16(%1), "#c"           \n\t"\
1644
    "movq "#o"+24(%1), "#d"           \n\t"\
1645

    
1646
#define STORE4(o, a, b, c, d)\
1647
    "movq "#a", "#o"(%1)              \n\t"\
1648
    "movq "#b", "#o"+8(%1)            \n\t"\
1649
    "movq "#c", "#o"+16(%1)           \n\t"\
1650
    "movq "#d", "#o"+24(%1)           \n\t"\

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
1656
    "movq "#a", "#t"                  \n\t"\
1657
    "psrlq $32, "#a"                  \n\t"\
1658
    "paddusw "#t", "#a"               \n\t"\
1659
    "movq "#a", "#t"                  \n\t"\
1660
    "psrlq $16, "#a"                  \n\t"\
1661
    "paddusw "#t", "#a"               \n\t"\
1662
    "movd "#a", "#dst"                \n\t"\
1663

    
1664
#define HSUM_MMX2(a, t, dst)\
1665
    "pshufw $0x0E, "#a", "#t"         \n\t"\
1666
    "paddusw "#t", "#a"               \n\t"\
1667
    "pshufw $0x01, "#a", "#t"         \n\t"\
1668
    "paddusw "#t", "#a"               \n\t"\
1669
    "movd "#a", "#dst"                \n\t"\
1670

    
1671
#define HSUM_SSE2(a, t, dst)\
1672
    "movhlps "#a", "#t"               \n\t"\
1673
    "paddusw "#t", "#a"               \n\t"\
1674
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
1675
    "paddusw "#t", "#a"               \n\t"\
1676
    "pshuflw $0x01, "#a", "#t"        \n\t"\
1677
    "paddusw "#t", "#a"               \n\t"\
1678
    "movd "#a", "#dst"                \n\t"\
1679

    
1680
#define HADAMARD8_DIFF_MMX(cpu) \
1681
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1682
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1683
    int sum;\
1684
\
1685
    assert(h==8);\
1686
\
1687
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1688
\
1689
    asm volatile(\
1690
        HADAMARD48\
1691
\
1692
        "movq %%mm7, 96(%1)             \n\t"\
1693
\
1694
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1695
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1696
\
1697
        "movq 96(%1), %%mm7             \n\t"\
1698
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1699
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1700
\
1701
        : "=r" (sum)\
1702
        : "r"(temp)\
1703
    );\
1704
\
1705
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1706
\
1707
    asm volatile(\
1708
        HADAMARD48\
1709
\
1710
        "movq %%mm7, 96(%1)             \n\t"\
1711
\
1712
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1713
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1714
\
1715
        "movq 96(%1), %%mm7             \n\t"\
1716
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1717
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
1718
        "movq %%mm6, %%mm7              \n\t"\
1719
        "movq %%mm0, %%mm6              \n\t"\
1720
\
1721
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1722
\
1723
        HADAMARD48\
1724
        "movq %%mm7, 64(%1)             \n\t"\
1725
        MMABS(%%mm0, %%mm7)\
1726
        MMABS(%%mm1, %%mm7)\
1727
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1728
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1729
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1730
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1731
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1732
        "movq 64(%1), %%mm2             \n\t"\
1733
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1734
        "paddusw %%mm1, %%mm0           \n\t"\
1735
        "movq %%mm0, 64(%1)             \n\t"\
1736
\
1737
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1738
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1739
\
1740
        HADAMARD48\
1741
        "movq %%mm7, (%1)               \n\t"\
1742
        MMABS(%%mm0, %%mm7)\
1743
        MMABS(%%mm1, %%mm7)\
1744
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1745
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1746
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1747
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1748
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1749
        "movq (%1), %%mm2               \n\t"\
1750
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1751
        "paddusw 64(%1), %%mm0          \n\t"\
1752
        "paddusw %%mm1, %%mm0           \n\t"\
1753
\
1754
        HSUM(%%mm0, %%mm1, %0)\
1755
\
1756
        : "=r" (sum)\
1757
        : "r"(temp)\
1758
    );\
1759
    return sum&0xFFFF;\
1760
}\
1761
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
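
/* WARPER8_16_SQ builds the 16x16 comparison from the 8x8 function.  The final
 * "& 0xFFFF" keeps only the low word produced by HSUM (the upper word of the
 * movd result still contains partial sums); the paddusw accumulation itself
 * saturates at 16 bits, as noted in the FIXME above. */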
1762

    
1763
#define HADAMARD8_DIFF_SSE2(cpu) \
1764
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1765
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1766
    int sum;\
1767
\
1768
    assert(h==8);\
1769
\
1770
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1771
\
1772
    asm volatile(\
1773
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1774
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1775
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1776
        MMABS_SUM_8x8\
1777
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1778
        : "=r" (sum)\
1779
        : "r"(temp)\
1780
    );\
1781
    return sum&0xFFFF;\
1782
}\
1783
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1784

    
1785
#define MMABS(a,z)         MMABS_MMX(a,z)
1786
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
1787
HADAMARD8_DIFF_MMX(mmx)
1788
#undef MMABS
1789
#undef HSUM
1790

    
1791
#define MMABS(a,z)         MMABS_MMX2(a,z)
1792
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
1793
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
1794
HADAMARD8_DIFF_MMX(mmx2)
1795
HADAMARD8_DIFF_SSE2(sse2)
1796
#undef MMABS
1797
#undef MMABS_SUM_8x8
1798
#undef HSUM
1799

    
1800
#ifdef HAVE_SSSE3
1801
#define MMABS(a,z)         MMABS_SSSE3(a,z)
1802
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
1803
HADAMARD8_DIFF_SSE2(ssse3)
1804
#undef MMABS
1805
#undef MMABS_SUM_8x8
1806
#endif
1807

    
1808
#define DCT_SAD4(m,mm,o)\
1809
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
1810
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
1811
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
1812
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
1813
    MMABS_SUM(mm##2, mm##6, mm##0)\
1814
    MMABS_SUM(mm##3, mm##7, mm##1)\
1815
    MMABS_SUM(mm##4, mm##6, mm##0)\
1816
    MMABS_SUM(mm##5, mm##7, mm##1)\
1817

    
1818
#define DCT_SAD_MMX\
1819
    "pxor %%mm0, %%mm0                \n\t"\
1820
    "pxor %%mm1, %%mm1                \n\t"\
1821
    DCT_SAD4(q, %%mm, 0)\
1822
    DCT_SAD4(q, %%mm, 8)\
1823
    DCT_SAD4(q, %%mm, 64)\
1824
    DCT_SAD4(q, %%mm, 72)\
1825
    "paddusw %%mm1, %%mm0             \n\t"\
1826
    HSUM(%%mm0, %%mm1, %0)
1827

    
1828
#define DCT_SAD_SSE2\
1829
    "pxor %%xmm0, %%xmm0              \n\t"\
1830
    "pxor %%xmm1, %%xmm1              \n\t"\
1831
    DCT_SAD4(dqa, %%xmm, 0)\
1832
    DCT_SAD4(dqa, %%xmm, 64)\
1833
    "paddusw %%xmm1, %%xmm0           \n\t"\
1834
    HSUM(%%xmm0, %%xmm1, %0)
1835

    
1836
#define DCT_SAD_FUNC(cpu) \
1837
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1838
    int sum;\
1839
    asm volatile(\
1840
        DCT_SAD\
1841
        :"=r"(sum)\
1842
        :"r"(block)\
1843
    );\
1844
    return sum&0xFFFF;\
1845
}
1846

    
1847
#define DCT_SAD       DCT_SAD_MMX
1848
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1849
#define MMABS(a,z)    MMABS_MMX(a,z)
1850
DCT_SAD_FUNC(mmx)
1851
#undef MMABS
1852
#undef HSUM
1853

    
1854
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1855
#define MMABS(a,z)    MMABS_MMX2(a,z)
1856
DCT_SAD_FUNC(mmx2)
1857
#undef HSUM
1858
#undef DCT_SAD
1859

    
1860
#define DCT_SAD       DCT_SAD_SSE2
1861
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1862
DCT_SAD_FUNC(sse2)
1863
#undef MMABS
1864

    
1865
#ifdef HAVE_SSSE3
1866
#define MMABS(a,z)    MMABS_SSSE3(a,z)
1867
DCT_SAD_FUNC(ssse3)
1868
#undef MMABS
1869
#endif
1870
#undef HSUM
1871
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){
    int sum;
    long i=size;
    asm volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
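
/* Sums squared differences between an int8_t and an int16_t array.  The
 * punpckhbw/punpcklbw + psraw $8 pairs sign-extend the int8 values to 16
 * bits (the bytes land in the high half of each word and the arithmetic
 * shift discards the irrelevant low half); pmaddwd then squares and pairwise
 * adds, and the trailing psrlq/paddd folds the two 32-bit partial sums. */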

#endif //CONFIG_ENCODERS

#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
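
/* Full-pel copies involve no averaging, so the "no rounding" variants can
 * simply alias the plain put_pixels versions. */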
1908

    
1909
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1910
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
1911
        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
1912
        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
1913
        "movq "#in7", " #m3 "             \n\t" /* d */\
1914
        "movq "#in0", %%mm5               \n\t" /* D */\
1915
        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
1916
        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
1917
        "movq "#in1", %%mm5               \n\t" /* C */\
1918
        "movq "#in2", %%mm6               \n\t" /* B */\
1919
        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
1920
        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
1921
        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
1922
        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
1923
        "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
1924
        "paddw " #rnd ", %%mm4            \n\t" /* x2 */\
1925
        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1926
        "psraw $5, %%mm5                  \n\t"\
1927
        "packuswb %%mm5, %%mm5            \n\t"\
1928
        OP(%%mm5, out, %%mm7, d)
1929

    
1930
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1931
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1932
    uint64_t temp;\
1933
\
1934
    asm volatile(\
1935
        "pxor %%mm7, %%mm7                \n\t"\
1936
        "1:                               \n\t"\
1937
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
1938
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
1939
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
1940
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
1941
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
1942
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
1943
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
1944
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
1945
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
1946
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
1947
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
1948
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
1949
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
1950
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
1951
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
1952
        "paddw %%mm3, %%mm5               \n\t" /* b */\
1953
        "paddw %%mm2, %%mm6               \n\t" /* c */\
1954
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
1955
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
1956
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
1957
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
1958
        "paddw %%mm4, %%mm0               \n\t" /* a */\
1959
        "paddw %%mm1, %%mm5               \n\t" /* d */\
1960
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1961
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
1962
        "paddw %6, %%mm6                  \n\t"\
1963
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
1964
        "psraw $5, %%mm0                  \n\t"\
1965
        "movq %%mm0, %5                   \n\t"\
1966
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1967
        \
1968
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
1969
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
1970
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
1971
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
1972
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
1973
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
1974
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
1975
        "paddw %%mm0, %%mm2               \n\t" /* b */\
1976
        "paddw %%mm5, %%mm3               \n\t" /* c */\
1977
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
1978
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
1979
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
1980
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
1981
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
1982
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
1983
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
1984
        "paddw %%mm2, %%mm1               \n\t" /* a */\
1985
        "paddw %%mm6, %%mm4               \n\t" /* d */\
1986
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1987
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
1988
        "paddw %6, %%mm1                  \n\t"\
1989
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
1990
        "psraw $5, %%mm3                  \n\t"\
1991
        "movq %5, %%mm1                   \n\t"\
1992
        "packuswb %%mm3, %%mm1            \n\t"\
1993
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
1994
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1995
        \
1996
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
1997
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
1998
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
1999
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
2000
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
2001
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
2002
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
2003
        "paddw %%mm1, %%mm5               \n\t" /* b */\
2004
        "paddw %%mm4, %%mm0               \n\t" /* c */\
2005
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
2006
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
2007
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
2008
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
2009
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
2010
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
2011
        "paddw %%mm3, %%mm2               \n\t" /* d */\
2012
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
2013
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
2014
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
2015
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
2016
        "paddw %%mm2, %%mm6               \n\t" /* a */\
2017
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
2018
        "paddw %6, %%mm0                  \n\t"\
2019
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
2020
        "psraw $5, %%mm0                  \n\t"\
2021
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
2022
        \
2023
        "paddw %%mm5, %%mm3               \n\t" /* a */\
2024
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
2025
        "paddw %%mm4, %%mm6               \n\t" /* b */\
2026
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
2027
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
2028
        "paddw %%mm1, %%mm4               \n\t" /* c */\
2029
        "paddw %%mm2, %%mm5               \n\t" /* d */\
2030
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
2031
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
2032
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2033
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
2034
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
2035
        "paddw %6, %%mm4                  \n\t"\
2036
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
2037
        "psraw $5, %%mm4                  \n\t"\
2038
        "packuswb %%mm4, %%mm0            \n\t"\
2039
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2040
        \
2041
        "add %3, %0                       \n\t"\
2042
        "add %4, %1                       \n\t"\
2043
        "decl %2                          \n\t"\
2044
        " jnz 1b                          \n\t"\
2045
        : "+a"(src), "+c"(dst), "+m"(h)\
2046
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2047
        : "memory"\
2048
    );\
2049
}\
2050
\
2051
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2052
    int i;\
2053
    int16_t temp[16];\
2054
    /* quick HACK, XXX FIXME MUST be optimized */\
2055
    for(i=0; i<h; i++)\
2056
    {\
2057
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2058
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2059
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2060
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2061
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2062
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2063
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2064
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2065
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2066
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2067
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2068
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2069
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2070
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2071
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2072
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2073
        asm volatile(\
2074
            "movq (%0), %%mm0               \n\t"\
2075
            "movq 8(%0), %%mm1              \n\t"\
2076
            "paddw %2, %%mm0                \n\t"\
2077
            "paddw %2, %%mm1                \n\t"\
2078
            "psraw $5, %%mm0                \n\t"\
2079
            "psraw $5, %%mm1                \n\t"\
2080
            "packuswb %%mm1, %%mm0          \n\t"\
2081
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2082
            "movq 16(%0), %%mm0             \n\t"\
2083
            "movq 24(%0), %%mm1             \n\t"\
2084
            "paddw %2, %%mm0                \n\t"\
2085
            "paddw %2, %%mm1                \n\t"\
2086
            "psraw $5, %%mm0                \n\t"\
2087
            "psraw $5, %%mm1                \n\t"\
2088
            "packuswb %%mm1, %%mm0          \n\t"\
2089
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2090
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2091
            : "memory"\
2092
        );\
2093
        dst+=dstStride;\
2094
        src+=srcStride;\
2095
    }\
2096
}\
2097
\
2098
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2099
    uint64_t temp;\
2100
\
2101
    asm volatile(\
2102
        "pxor %%mm7, %%mm7                \n\t"\
2103
        "1:                               \n\t"\
2104
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
2105
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
2106
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
2107
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
2108
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
2109
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
2110
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
2111
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
2112
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
2113
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
2114
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
2115
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
2116
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
2117
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
2118
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
2119
        "paddw %%mm3, %%mm5               \n\t" /* b */\
2120
        "paddw %%mm2, %%mm6               \n\t" /* c */\
2121
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
2122
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
2123
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
2124
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
2125
        "paddw %%mm4, %%mm0               \n\t" /* a */\
2126
        "paddw %%mm1, %%mm5               \n\t" /* d */\
2127
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2128
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
2129
        "paddw %6, %%mm6                  \n\t"\
2130
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
2131
        "psraw $5, %%mm0                  \n\t"\
2132
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2133
        \
2134
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
2135
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
2136
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
2137
        "paddw %%mm5, %%mm1               \n\t" /* a */\
2138
        "paddw %%mm6, %%mm2               \n\t" /* b */\
2139
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
2140
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
2141
        "paddw %%mm6, %%mm3               \n\t" /* c */\
2142
        "paddw %%mm5, %%mm4               \n\t" /* d */\
2143
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
2144
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
2145
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2146
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
2147
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
2148
        "paddw %6, %%mm1                  \n\t"\
2149
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
2150
        "psraw $5, %%mm3                  \n\t"\
2151
        "packuswb %%mm3, %%mm0            \n\t"\
2152
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
2153
        \
2154
        "add %3, %0                       \n\t"\
2155
        "add %4, %1                       \n\t"\
2156
        "decl %2                          \n\t"\
2157
        " jnz 1b                          \n\t"\
2158
        : "+a"(src), "+c"(dst), "+m"(h)\
2159
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2160
        : "memory"\
2161
    );\
2162
}\
2163
\
2164
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2165
    int i;\
2166
    int16_t temp[8];\
2167
    /* quick HACK, XXX FIXME MUST be optimized */\
2168
    for(i=0; i<h; i++)\
2169
    {\
2170
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2171
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2172
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2173
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2174
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2175
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2176
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2177
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2178
        asm volatile(\
2179
            "movq (%0), %%mm0           \n\t"\
2180
            "movq 8(%0), %%mm1          \n\t"\
2181
            "paddw %2, %%mm0            \n\t"\
2182
            "paddw %2, %%mm1            \n\t"\
2183
            "psraw $5, %%mm0            \n\t"\
2184
            "psraw $5, %%mm1            \n\t"\
2185
            "packuswb %%mm1, %%mm0      \n\t"\
2186
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2187
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2188
            :"memory"\
2189
        );\
2190
        dst+=dstStride;\
2191
        src+=srcStride;\
2192
    }\
2193
}
2194

    
2195
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2196
\
2197
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2198
    uint64_t temp[17*4];\
2199
    uint64_t *temp_ptr= temp;\
2200
    int count= 17;\
2201
\
2202
    /*FIXME unroll */\
2203
    asm volatile(\
2204
        "pxor %%mm7, %%mm7              \n\t"\
2205
        "1:                             \n\t"\
2206
        "movq (%0), %%mm0               \n\t"\
2207
        "movq (%0), %%mm1               \n\t"\
2208
        "movq 8(%0), %%mm2              \n\t"\
2209
        "movq 8(%0), %%mm3              \n\t"\
2210
        "punpcklbw %%mm7, %%mm0         \n\t"\
2211
        "punpckhbw %%mm7, %%mm1         \n\t"\
2212
        "punpcklbw %%mm7, %%mm2         \n\t"\
2213
        "punpckhbw %%mm7, %%mm3         \n\t"\
2214
        "movq %%mm0, (%1)               \n\t"\
2215
        "movq %%mm1, 17*8(%1)           \n\t"\
2216
        "movq %%mm2, 2*17*8(%1)         \n\t"\
2217
        "movq %%mm3, 3*17*8(%1)         \n\t"\
2218
        "add $8, %1                     \n\t"\
2219
        "add %3, %0                     \n\t"\
2220
        "decl %2                        \n\t"\
2221
        " jnz 1b                        \n\t"\
2222
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2223
        : "r" ((long)srcStride)\
2224
        : "memory"\
2225
    );\
2226
    \
2227
    temp_ptr= temp;\
2228
    count=4;\
2229
    \
2230
/*FIXME reorder for speed */\
2231
    asm volatile(\
2232
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2233
        "1:                             \n\t"\
2234
        "movq (%0), %%mm0               \n\t"\
2235
        "movq 8(%0), %%mm1              \n\t"\
2236
        "movq 16(%0), %%mm2             \n\t"\
2237
        "movq 24(%0), %%mm3             \n\t"\
2238
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2239
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2240
        "add %4, %1                     \n\t"\
2241
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2242
        \
2243
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2244
        "add %4, %1                     \n\t"\
2245
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2246
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2247
        "add %4, %1                     \n\t"\
2248
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2249
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2250
        "add %4, %1                     \n\t"\
2251
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2252
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2253
        "add %4, %1                     \n\t"\
2254
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2255
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2256
        "add %4, %1                     \n\t"\
2257
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2258
        \
2259
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2260
        "add %4, %1                     \n\t"  \
2261
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2262
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2263
        \
2264
        "add $136, %0                   \n\t"\
2265
        "add %6, %1                     \n\t"\
2266
        "decl %2                        \n\t"\
2267
        " jnz 1b                        \n\t"\
2268
        \
2269
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2270
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2271
        :"memory"\
2272
    );\
2273
}\
2274
\
2275
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2276
    uint64_t temp[9*2];\
2277
    uint64_t *temp_ptr= temp;\
2278
    int count= 9;\
2279
\
2280
    /*FIXME unroll */\
2281
    asm volatile(\
2282
        "pxor %%mm7, %%mm7              \n\t"\
2283
        "1:                             \n\t"\
2284
        "movq (%0), %%mm0               \n\t"\
2285
        "movq (%0), %%mm1               \n\t"\
2286
        "punpcklbw %%mm7, %%mm0         \n\t"\
2287
        "punpckhbw %%mm7, %%mm1         \n\t"\
2288
        "movq %%mm0, (%1)               \n\t"\
2289
        "movq %%mm1, 9*8(%1)            \n\t"\
2290
        "add $8, %1                     \n\t"\
2291
        "add %3, %0                     \n\t"\
2292
        "decl %2                        \n\t"\
2293
        " jnz 1b                        \n\t"\
2294
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2295
        : "r" ((long)srcStride)\
2296
        : "memory"\
2297
    );\
2298
    \
2299
    temp_ptr= temp;\
2300
    count=2;\
2301
    \
2302
/*FIXME reorder for speed */\
2303
    asm volatile(\
2304
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2305
        "1:                             \n\t"\
2306
        "movq (%0), %%mm0               \n\t"\
2307
        "movq 8(%0), %%mm1              \n\t"\
2308
        "movq 16(%0), %%mm2             \n\t"\
2309
        "movq 24(%0), %%mm3             \n\t"\
2310
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2311
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2312
        "add %4, %1                     \n\t"\
2313
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2314
        \
2315
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2316
        "add %4, %1                     \n\t"\
2317
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2318
        \
2319
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2320
        "add %4, %1                     \n\t"\
2321
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2322
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2323
                \
2324
        "add $72, %0                    \n\t"\
2325
        "add %6, %1                     \n\t"\
2326
        "decl %2                        \n\t"\
2327
        " jnz 1b                        \n\t"\
2328
         \
2329
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2330
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2331
        : "memory"\
2332
   );\
2333
}\
2334
\
2335
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2336
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
2337
}\
2338
\
2339
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2340
    uint64_t temp[8];\
2341
    uint8_t * const half= (uint8_t*)temp;\
2342
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2343
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2344
}\
2345
\
2346
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2347
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2348
}\
2349
\
2350
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2351
    uint64_t temp[8];\
2352
    uint8_t * const half= (uint8_t*)temp;\
2353
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2354
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2355
}\
2356
\
2357
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2358
    uint64_t temp[8];\
2359
    uint8_t * const half= (uint8_t*)temp;\
2360
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2361
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2362
}\
2363
\
2364
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2365
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2366
}\
2367
\
2368
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2369
    uint64_t temp[8];\
2370
    uint8_t * const half= (uint8_t*)temp;\
2371
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2372
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2373
}\
2374
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2375
    uint64_t half[8 + 9];\
2376
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2377
    uint8_t * const halfHV= ((uint8_t*)half);\
2378
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2379
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2380
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2381
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2382
}\
2383
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2384
    uint64_t half[8 + 9];\
2385
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2386
    uint8_t * const halfHV= ((uint8_t*)half);\
2387
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2388
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2389
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2390
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2391
}\
2392
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2393
    uint64_t half[8 + 9];\
2394
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2395
    uint8_t * const halfHV= ((uint8_t*)half);\
2396
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2397
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2398
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2399
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2400
}\
2401
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2402
    uint64_t half[8 + 9];\
2403
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2404
    uint8_t * const halfHV= ((uint8_t*)half);\
2405
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2406
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2407
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2408
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2409
}\
2410
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2411
    uint64_t half[8 + 9];\
2412
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2413
    uint8_t * const halfHV= ((uint8_t*)half);\
2414
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2415
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2416
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2417
}\
2418
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2419
    uint64_t half[8 + 9];\
2420
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2421
    uint8_t * const halfHV= ((uint8_t*)half);\
2422
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2423
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2424
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2425
}\
2426
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2427
    uint64_t half[8 + 9];\
2428
    uint8_t * const halfH= ((uint8_t*)half);\
2429
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2430
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2431
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2432
}\
2433
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2434
    uint64_t half[8 + 9];\
2435
    uint8_t * const halfH= ((uint8_t*)half);\
2436
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2437
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2438
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2439
}\
2440
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2441
    uint64_t half[9];\
2442
    uint8_t * const halfH= ((uint8_t*)half);\
2443
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2444
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2445
}\
2446
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2447
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
2448
}\
2449
\
2450
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2451
    uint64_t temp[32];\
2452
    uint8_t * const half= (uint8_t*)temp;\
2453
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2454
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2455
}\
2456
\
2457
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2458
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2459
}\
2460
\
2461
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2462
    uint64_t temp[32];\
2463
    uint8_t * const half= (uint8_t*)temp;\
2464
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2465
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2466
}\
2467
\
2468
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2469
    uint64_t temp[32];\
2470
    uint8_t * const half= (uint8_t*)temp;\
2471
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2472
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2473
}\
2474
\
2475
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2476
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2477
}\
2478
\
2479
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2480
    uint64_t temp[32];\
2481
    uint8_t * const half= (uint8_t*)temp;\
2482
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2483
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2484
}\
2485
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2486
    uint64_t half[16*2 + 17*2];\
2487
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2488
    uint8_t * const halfHV= ((uint8_t*)half);\
2489
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2490
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2491
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2492
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2493
}\
2494
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2495
    uint64_t half[16*2 + 17*2];\
2496
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2497
    uint8_t * const halfHV= ((uint8_t*)half);\
2498
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2499
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2500
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2501
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2502
}\
2503
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2504
    uint64_t half[16*2 + 17*2];\
2505
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2506
    uint8_t * const halfHV= ((uint8_t*)half);\
2507
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2508
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2509
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2510
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2511
}\
2512
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2513
    uint64_t half[16*2 + 17*2];\
2514
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2515
    uint8_t * const halfHV= ((uint8_t*)half);\
2516
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2517
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2518
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2519
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2520
}\
2521
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2522
    uint64_t half[16*2 + 17*2];\
2523
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2524
    uint8_t * const halfHV= ((uint8_t*)half);\
2525
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2526
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2527
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2528
}\
2529
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2530
    uint64_t half[16*2 + 17*2];\
2531
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2532
    uint8_t * const halfHV= ((uint8_t*)half);\
2533
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2534
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2535
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2536
}\
2537
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2538
    uint64_t half[17*2];\
2539
    uint8_t * const halfH= ((uint8_t*)half);\
2540
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2541
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2542
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2543
}\
2544
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2545
    uint64_t half[17*2];\
2546
    uint8_t * const halfH= ((uint8_t*)half);\
2547
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2548
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2549
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2550
}\
2551
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2552
    uint64_t half[17*2];\
2553
    uint8_t * const halfH= ((uint8_t*)half);\
2554
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2555
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2556
}

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"

QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
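
/* These expansions generate the full put/avg qpel{8,16}_mcXY function set,
 * where X and Y are the horizontal and vertical quarter-pel phases (0-3).
 * The no_rnd variants pass ff_pw_15 instead of ff_pw_16, so the bias added
 * before the >>5 rounds ties down ("no rounding" mode) instead of to nearest;
 * the avg_ variants combine with the existing destination via the rounded
 * byte-wise average of AVG_MMX2_OP (pavgb) or AVG_3DNOW_OP (pavgusb). */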

/***********************************/
/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
2580

    
2581
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2582
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2583
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2584
}
2585
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2586
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2587
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2588
}
2589

    
2590
#define QPEL_2TAP(OPNAME, SIZE, MMX)\
2591
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2592
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2593
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2594
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2595
                          OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2596
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2597
                          OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2598
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2599
                          OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2600
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2601
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2602
}\
2603
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2604
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2605
}\
2606
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
2607
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
2608
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0)\
2609
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
2610
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
2611
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
2612
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
2613
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
2614

    
2615
QPEL_2TAP(put_, 16, mmx2)
2616
QPEL_2TAP(avg_, 16, mmx2)
2617
QPEL_2TAP(put_,  8, mmx2)
2618
QPEL_2TAP(avg_,  8, mmx2)
2619
QPEL_2TAP(put_, 16, 3dnow)
2620
QPEL_2TAP(avg_, 16, 3dnow)
2621
QPEL_2TAP(put_,  8, 3dnow)
2622
QPEL_2TAP(avg_,  8, 3dnow)


#if 0
static void just_return() { return; }
#endif

#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;

static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w = 8;
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    if( // non-constant fullpel offset (3% of blocks)
        (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
         oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }
2666

    
2667
    src += ix + iy*stride;
2668
    if( (unsigned)ix >= width-w ||
2669
        (unsigned)iy >= height-h )
2670
    {
2671
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
2672
        src = edge_buf;
2673
    }
2674

    
2675
    asm volatile(
2676
        "movd         %0, %%mm6 \n\t"
2677
        "pxor      %%mm7, %%mm7 \n\t"
2678
        "punpcklwd %%mm6, %%mm6 \n\t"
2679
        "punpcklwd %%mm6, %%mm6 \n\t"
2680
        :: "r"(1<<shift)
2681
    );
2682

    
2683
    for(x=0; x<w; x+=4){
2684
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
2685
                            oxs - dxys + dxxs*(x+1),
2686
                            oxs - dxys + dxxs*(x+2),
2687
                            oxs - dxys + dxxs*(x+3) };
2688
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
2689
                            oys - dyys + dyxs*(x+1),
2690
                            oys - dyys + dyxs*(x+2),
2691
                            oys - dyys + dyxs*(x+3) };
2692

    
2693
        for(y=0; y<h; y++){
2694
            asm volatile(
2695
                "movq   %0,  %%mm4 \n\t"
2696
                "movq   %1,  %%mm5 \n\t"
2697
                "paddw  %2,  %%mm4 \n\t"
2698
                "paddw  %3,  %%mm5 \n\t"
2699
                "movq   %%mm4, %0  \n\t"
2700
                "movq   %%mm5, %1  \n\t"
2701
                "psrlw  $12, %%mm4 \n\t"
2702
                "psrlw  $12, %%mm5 \n\t"
2703
                : "+m"(*dx4), "+m"(*dy4)
2704
                : "m"(*dxy4), "m"(*dyy4)
2705
            );
2706

    
2707
            asm volatile(
2708
                "movq   %%mm6, %%mm2 \n\t"
2709
                "movq   %%mm6, %%mm1 \n\t"
2710
                "psubw  %%mm4, %%mm2 \n\t"
2711
                "psubw  %%mm5, %%mm1 \n\t"
2712
                "movq   %%mm2, %%mm0 \n\t"
2713
                "movq   %%mm4, %%mm3 \n\t"
2714
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
2715
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
2716
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
2717
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
2718

    
2719
                "movd   %4,    %%mm5 \n\t"
2720
                "movd   %3,    %%mm4 \n\t"
2721
                "punpcklbw %%mm7, %%mm5 \n\t"
2722
                "punpcklbw %%mm7, %%mm4 \n\t"
2723
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
2724
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
2725

    
2726
                "movd   %2,    %%mm5 \n\t"
2727
                "movd   %1,    %%mm4 \n\t"
2728
                "punpcklbw %%mm7, %%mm5 \n\t"
2729
                "punpcklbw %%mm7, %%mm4 \n\t"
2730
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
2731
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
2732
                "paddw  %5,    %%mm1 \n\t"
2733
                "paddw  %%mm3, %%mm2 \n\t"
2734
                "paddw  %%mm1, %%mm0 \n\t"
2735
                "paddw  %%mm2, %%mm0 \n\t"
2736

    
2737
                "psrlw    %6,    %%mm0 \n\t"
2738
                "packuswb %%mm0, %%mm0 \n\t"
2739
                "movd     %%mm0, %0    \n\t"
2740

    
2741
                : "=m"(dst[x+y*stride])
2742
                : "m"(src[0]), "m"(src[1]),
2743
                  "m"(src[stride]), "m"(src[stride+1]),
2744
                  "m"(*r4), "m"(shift2)
2745
            );
2746
            src += stride;
2747
        }
2748
        src += 4-h*stride;
2749
    }
2750
}
2751

    
2752
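/* Illustrative only: a scalar sketch of the bilinear blend each iteration of the
   loop above produces per output pixel, reconstructed from the comments in the asm
   (s = 1<<shift is kept in mm6, dx/dy are the per-pixel fractions in mm4/mm5, r and
   shift2 == 2*shift are passed as %5/%6).  Not part of the build. */
#if 0
static inline uint8_t gmc_pixel_sketch(const uint8_t *src, int stride,
                                       int dx, int dy, int s, int r, int shift2)
{
    int v = src[0]        * (s-dx) * (s-dy)
          + src[1]        *  dx    * (s-dy)
          + src[stride]   * (s-dx) *  dy
          + src[stride+1] *  dx    *  dy
          + r;
    v >>= shift2;
    return v > 255 ? 255 : v;   /* packuswb saturates the final word to 8 bits */
}
#endif
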
#ifdef CONFIG_ENCODERS
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    long i=0;

    assert(FFABS(scale) < 256);
    scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;

    asm volatile(
        "pcmpeqw %%mm6, %%mm6           \n\t" // -1w
        "psrlw $15, %%mm6               \n\t" //  1w
        "pxor %%mm7, %%mm7              \n\t"
        "movd  %4, %%mm5                \n\t"
        "punpcklwd %%mm5, %%mm5         \n\t"
        "punpcklwd %%mm5, %%mm5         \n\t"
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  8(%1, %0), %%mm1         \n\t"
        "pmulhw %%mm5, %%mm0            \n\t"
        "pmulhw %%mm5, %%mm1            \n\t"
        "paddw %%mm6, %%mm0             \n\t"
        "paddw %%mm6, %%mm1             \n\t"
        "psraw $1, %%mm0                \n\t"
        "psraw $1, %%mm1                \n\t"
        "paddw (%2, %0), %%mm0          \n\t"
        "paddw 8(%2, %0), %%mm1         \n\t"
        "psraw $6, %%mm0                \n\t"
        "psraw $6, %%mm1                \n\t"
        "pmullw (%3, %0), %%mm0         \n\t"
        "pmullw 8(%3, %0), %%mm1        \n\t"
        "pmaddwd %%mm0, %%mm0           \n\t"
        "pmaddwd %%mm1, %%mm1           \n\t"
        "paddd %%mm1, %%mm0             \n\t"
        "psrld $4, %%mm0                \n\t"
        "paddd %%mm0, %%mm7             \n\t"
        "add $16, %0                    \n\t"
        "cmp $128, %0                   \n\t" //FIXME optimize & bench
        " jb 1b                         \n\t"
        "movq %%mm7, %%mm6              \n\t"
        "psrlq $32, %%mm7               \n\t"
        "paddd %%mm6, %%mm7             \n\t"
        "psrld $2, %%mm7                \n\t"
        "movd %%mm7, %0                 \n\t"

        : "+r" (i)
        : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
    );
    return i;
}

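/* Illustrative only: a rough scalar model of what the loop in try_8x8basis_mmx
   computes, reconstructed from the instruction comments above.  The SIMD version
   applies the >>4 per group of coefficients before accumulating, so the result is
   only approximately this; BASIS_SHIFT and RECON_SHIFT come from the encoder
   headers.  Not part of the build. */
#if 0
static int try_8x8basis_sketch(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
{
    int i;
    unsigned int sum=0;
    for(i=0; i<64; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT - 1)))
                          >> (BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b >>= RECON_SHIFT;
        sum += (w*b) * (w*b) >> 4;
    }
    return sum>>2;
}
#endif
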
static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
    long i=0;

    if(FFABS(scale) < 256){
        scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
        asm volatile(
                "pcmpeqw %%mm6, %%mm6   \n\t" // -1w
                "psrlw $15, %%mm6       \n\t" //  1w
                "movd  %3, %%mm5        \n\t"
                "punpcklwd %%mm5, %%mm5 \n\t"
                "punpcklwd %%mm5, %%mm5 \n\t"
                "1:                     \n\t"
                "movq  (%1, %0), %%mm0  \n\t"
                "movq  8(%1, %0), %%mm1 \n\t"
                "pmulhw %%mm5, %%mm0    \n\t"
                "pmulhw %%mm5, %%mm1    \n\t"
                "paddw %%mm6, %%mm0     \n\t"
                "paddw %%mm6, %%mm1     \n\t"
                "psraw $1, %%mm0        \n\t"
                "psraw $1, %%mm1        \n\t"
                "paddw (%2, %0), %%mm0  \n\t"
                "paddw 8(%2, %0), %%mm1 \n\t"
                "movq %%mm0, (%2, %0)   \n\t"
                "movq %%mm1, 8(%2, %0)  \n\t"
                "add $16, %0            \n\t"
                "cmp $128, %0           \n\t" //FIXME optimize & bench
                " jb 1b                 \n\t"

                : "+r" (i)
                : "r"(basis), "r"(rem), "g"(scale)
        );
    }else{
        for(i=0; i<8*8; i++){
            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
        }
    }
}
#endif /* CONFIG_ENCODERS */

#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        asm volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH

#include "h264dsp_mmx.c"

/* AVS specific */
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

void ff_vp3_idct_sse2(int16_t *input_data);
void ff_vp3_idct_mmx(int16_t *data);
void ff_vp3_dsp_init_mmx(void);

/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
#ifdef CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_sse2(block);
    put_signed_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_sse2(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_mmx(block);
    put_signed_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_mmx(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}

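/* Illustrative only: the *_idct_put/_add wrappers above all follow the same pattern:
   run the IDCT in place on the coefficient block, then either store (put) or
   accumulate (add) the result into the picture with clamping to [0,255]; the
   put_signed variant used for VP3 additionally biases the signed IDCT output into
   the unsigned range before clamping.  A scalar sketch of the store step, assuming
   an 8x8 block (not part of the build): */
#if 0
static void idct_store_sketch(uint8_t *dest, int line_size, const DCTELEM *block, int add)
{
    int i, j;
    for(i=0; i<8; i++){
        for(j=0; j<8; j++){
            int v = block[i*8 + j] + (add ? dest[j] : 0);
            dest[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        dest += line_size;
    }
}
#endif
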
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    asm volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        asm volatile(
            "movq    %0,    %%mm0 \n\t"
            "movq    %1,    %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1 \n\t"
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t"
            "pandn   %%mm1, %%mm4 \n\t"
            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    asm volatile("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    asm volatile(
            "movaps  %0,     %%xmm5 \n\t"
        ::"m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        asm volatile(
            "movaps  %0,     %%xmm0 \n\t"
            "movaps  %1,     %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}

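/* Illustrative only: a branchy scalar sketch of the square polar decoupling that the
   two SIMD routines above implement branchlessly (per element, mag[i]/ang[i] are the
   magnitude/angle vectors).  Intended to mirror the generic C path of the vorbis
   decoder; not part of the build. */
#if 0
static void vorbis_inverse_coupling_sketch(float *mag, float *ang, int blocksize)
{
    int i;
    for(i=0; i<blocksize; i++){
        if(mag[i] > 0.0){
            if(ang[i] > 0.0){ ang[i] = mag[i] - ang[i]; }
            else            { float t = ang[i]; ang[i] = mag[i]; mag[i] += t; }
        }else{
            if(ang[i] > 0.0){ ang[i] = mag[i] + ang[i]; }
            else            { float t = ang[i]; ang[i] = mag[i]; mag[i] -= t; }
        }
    }
}
#endif
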
static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long i = (len-4)*4;
    asm volatile(
        "1: \n\t"
        "movq    (%1,%0), %%mm0 \n\t"
        "movq   8(%1,%0), %%mm1 \n\t"
        "pfmul   (%2,%0), %%mm0 \n\t"
        "pfmul  8(%2,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub  $16, %0 \n\t"
        "jge 1b \n\t"
        "femms  \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i = (len-8)*4;
    asm volatile(
        "1: \n\t"
        "movaps    (%1,%0), %%xmm0 \n\t"
        "movaps  16(%1,%0), %%xmm1 \n\t"
        "mulps     (%2,%0), %%xmm0 \n\t"
        "mulps   16(%2,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub  $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}

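/* Illustrative only: the scalar operation both routines above vectorize is a simple
   element-wise in-place multiply; the SIMD loops walk the arrays backwards in byte
   offsets and assume a len that is a multiple of the unroll factor (and 16-byte
   alignment for the SSE version).  Not part of the build. */
#if 0
static void vector_fmul_sketch(float *dst, const float *src, int len)
{
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}
#endif
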
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;
    asm volatile(
        "1: \n\t"
        "pswapd   8(%1), %%mm0 \n\t"
        "pswapd    (%1), %%mm1 \n\t"
        "pfmul  (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq  %%mm0,  (%2,%0) \n\t"
        "movq  %%mm1, 8(%2,%0) \n\t"
        "add   $16, %1 \n\t"
        "sub   $16, %0 \n\t"
        "jge   1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
    asm volatile("femms");
}
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;
    asm volatile(
        "1: \n\t"
        "movaps        16(%1), %%xmm0 \n\t"
        "movaps          (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps        (%3,%0), %%xmm0 \n\t"
        "mulps      16(%3,%0), %%xmm1 \n\t"
        "movaps     %%xmm0,   (%2,%0) \n\t"
        "movaps     %%xmm1, 16(%2,%0) \n\t"
        "add    $32, %1 \n\t"
        "sub    $32, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
}

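/* Illustrative only: both routines above multiply element-wise with the second
   operand read backwards (pswapd / shufps $0x1b reverse each vector while src1
   advances and dst/src0 are walked from the end).  Scalar equivalent, not part of
   the build: */
#if 0
static void vector_fmul_reverse_sketch(float *dst, const float *src0, const float *src1, int len)
{
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[len-1-i];
}
#endif
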
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long i = (len-4)*4;
    if(step == 2 && src3 == 0){
        dst += (len-4)*2;
        asm volatile(
            "1: \n\t"
            "movq   (%2,%0),  %%mm0 \n\t"
            "movq  8(%2,%0),  %%mm1 \n\t"
            "pfmul  (%3,%0),  %%mm0 \n\t"
            "pfmul 8(%3,%0),  %%mm1 \n\t"
            "pfadd  (%4,%0),  %%mm0 \n\t"
            "pfadd 8(%4,%0),  %%mm1 \n\t"
            "movd     %%mm0,   (%1) \n\t"
            "movd     %%mm1, 16(%1) \n\t"
            "psrlq      $32,  %%mm0 \n\t"
            "psrlq      $32,  %%mm1 \n\t"
            "movd     %%mm0,  8(%1) \n\t"
            "movd     %%mm1, 24(%1) \n\t"
            "sub  $32, %1 \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movq  %%mm0,   (%1,%0) \n\t"
            "movq  %%mm1,  8(%1,%0) \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    asm volatile("femms");
}
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    long i = (len-8)*4;
    if(step == 2 && src3 == 0){
        dst += (len-8)*2;
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movss     %%xmm0,   (%1) \n\t"
            "movss     %%xmm1, 32(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 16(%1) \n\t"
            "movss     %%xmm3, 48(%1) \n\t"
            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
            "movss     %%xmm0,  8(%1) \n\t"
            "movss     %%xmm1, 40(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 24(%1) \n\t"
            "movss     %%xmm3, 56(%1) \n\t"
            "sub  $64, %1 \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movaps %%xmm0,   (%1,%0) \n\t"
            "movaps %%xmm1, 16(%1,%0) \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}

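/* Illustrative only: the fast paths above (step 1 or 2 with src3 == 0) compute a
   multiply-add with an optional interleaving stride on the destination; any other
   parameter combination falls back to ff_vector_fmul_add_add_c.  Scalar sketch of
   the fast-path result, not part of the build: */
#if 0
static void vector_fmul_add_add_sketch(float *dst, const float *src0, const float *src1,
                                       const float *src2, int len, int step)
{
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i]*src1[i] + src2[i];
}
#endif
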
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "pf2id       %1, %%mm0 \n\t"
            "pf2id       %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("femms");
}
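/* Illustrative only: per element the 3DNow! loop above converts with truncation
   (pf2id) and then saturates to the int16 range (packssdw); roughly, assuming len
   is a multiple of 4 (not part of the build): */
#if 0
static void float_to_int16_sketch(int16_t *dst, const float *src, int len)
{
    int i;
    for(i=0; i<len; i++){
        int v = (int)src[i];                    /* pf2id truncates toward zero */
        dst[i] = v < -32768 ? -32768 : v > 32767 ? 32767 : v;
    }
}
#endif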
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {