Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / dsputil_mmx.c @ 663deb54

History | View | Annotate | Download (138 KB)

<
1
/*
2
 * MMX optimized DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 *
22
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23
 */
24

    
25
#include "dsputil.h"
26
#include "simple_idct.h"
27
#include "mpegvideo.h"
28
#include "x86_cpu.h"
29
#include "mmx.h"
30
#include "vp3dsp_mmx.h"
31
#include "vp3dsp_sse2.h"
32

    
33
//#undef NDEBUG
34
//#include <assert.h>
35

    
36
extern void ff_idct_xvid_mmx(short *block);
37
extern void ff_idct_xvid_mmx2(short *block);
38

    
39
int mm_flags; /* multimedia extension flags */
40

    
41
/* pixel operations */
42
static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
43
static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
44
static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
45

    
46
static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
47
{0x8000000080000000ULL, 0x8000000080000000ULL};
48

    
49
static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
50
static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
51
static const uint64_t ff_pw_4  attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
52
static const uint64_t ff_pw_5  attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
53
static const uint64_t ff_pw_8  attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
54
static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
55
static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
56
static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
57
static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
58

    
59
static const uint64_t ff_pb_1  attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
60
static const uint64_t ff_pb_3  attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
61
static const uint64_t ff_pb_7  attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
62
static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
63
static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
64
static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
65
static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
66

    
67
#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
68
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
69

    
70
#define MOVQ_WONE(regd) \
71
    __asm __volatile ( \
72
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
73
    "psrlw $15, %%" #regd ::)
74

    
75
#define MOVQ_BFE(regd) \
76
    __asm __volatile ( \
77
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
78
    "paddb %%" #regd ", %%" #regd " \n\t" ::)
79

    
80
#ifndef PIC
81
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
82
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
83
#else
84
// for shared library it's better to use this way for accessing constants
85
// pcmpeqd -> -1
86
#define MOVQ_BONE(regd) \
87
    __asm __volatile ( \
88
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
89
    "psrlw $15, %%" #regd " \n\t" \
90
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)
91

    
92
#define MOVQ_WTWO(regd) \
93
    __asm __volatile ( \
94
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
95
    "psrlw $15, %%" #regd " \n\t" \
96
    "psllw $1, %%" #regd " \n\t"::)
97

    
98
#endif
99

    
100
// using regr as temporary and for the output result
101
// first argument is unmodifed and second is trashed
102
// regfe is supposed to contain 0xfefefefefefefefe
103
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
104
    "movq " #rega ", " #regr "  \n\t"\
105
    "pand " #regb ", " #regr "  \n\t"\
106
    "pxor " #rega ", " #regb "  \n\t"\
107
    "pand " #regfe "," #regb "  \n\t"\
108
    "psrlq $1, " #regb "        \n\t"\
109
    "paddb " #regb ", " #regr " \n\t"
110

    
111
#define PAVGB_MMX(rega, regb, regr, regfe) \
112
    "movq " #rega ", " #regr "  \n\t"\
113
    "por  " #regb ", " #regr "  \n\t"\
114
    "pxor " #rega ", " #regb "  \n\t"\
115
    "pand " #regfe "," #regb "  \n\t"\
116
    "psrlq $1, " #regb "        \n\t"\
117
    "psubb " #regb ", " #regr " \n\t"
118

    
119
// mm6 is supposed to contain 0xfefefefefefefefe
120
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
121
    "movq " #rega ", " #regr "  \n\t"\
122
    "movq " #regc ", " #regp "  \n\t"\
123
    "pand " #regb ", " #regr "  \n\t"\
124
    "pand " #regd ", " #regp "  \n\t"\
125
    "pxor " #rega ", " #regb "  \n\t"\
126
    "pxor " #regc ", " #regd "  \n\t"\
127
    "pand %%mm6, " #regb "      \n\t"\
128
    "pand %%mm6, " #regd "      \n\t"\
129
    "psrlq $1, " #regb "        \n\t"\
130
    "psrlq $1, " #regd "        \n\t"\
131
    "paddb " #regb ", " #regr " \n\t"\
132
    "paddb " #regd ", " #regp " \n\t"
133

    
134
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
135
    "movq " #rega ", " #regr "  \n\t"\
136
    "movq " #regc ", " #regp "  \n\t"\
137
    "por  " #regb ", " #regr "  \n\t"\
138
    "por  " #regd ", " #regp "  \n\t"\
139
    "pxor " #rega ", " #regb "  \n\t"\
140
    "pxor " #regc ", " #regd "  \n\t"\
141
    "pand %%mm6, " #regb "      \n\t"\
142
    "pand %%mm6, " #regd "      \n\t"\
143
    "psrlq $1, " #regd "        \n\t"\
144
    "psrlq $1, " #regb "        \n\t"\
145
    "psubb " #regb ", " #regr " \n\t"\
146
    "psubb " #regd ", " #regp " \n\t"
147

    
148
/***********************************/
149
/* MMX no rounding */
150
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
151
#define SET_RND  MOVQ_WONE
152
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
153
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
154

    
155
#include "dsputil_mmx_rnd.h"
156

    
157
#undef DEF
158
#undef SET_RND
159
#undef PAVGBP
160
#undef PAVGB
161
/***********************************/
162
/* MMX rounding */
163

    
164
#define DEF(x, y) x ## _ ## y ##_mmx
165
#define SET_RND  MOVQ_WTWO
166
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
167
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
168

    
169
#include "dsputil_mmx_rnd.h"
170

    
171
#undef DEF
172
#undef SET_RND
173
#undef PAVGBP
174
#undef PAVGB
175

    
176
/***********************************/
177
/* 3Dnow specific */
178

    
179
#define DEF(x) x ## _3dnow
180
#define PAVGB "pavgusb"
181

    
182
#include "dsputil_mmx_avg.h"
183

    
184
#undef DEF
185
#undef PAVGB
186

    
187
/***********************************/
188
/* MMX2 specific */
189

    
190
#define DEF(x) x ## _mmx2
191

    
192
/* Introduced only in MMX2 set */
193
#define PAVGB "pavgb"
194

    
195
#include "dsputil_mmx_avg.h"
196

    
197
#undef DEF
198
#undef PAVGB
199

    
200
#define SBUTTERFLY(a,b,t,n,m)\
201
    "mov" #m " " #a ", " #t "         \n\t" /* abcd */\
202
    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
203
    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\
204

    
205
#define TRANSPOSE4(a,b,c,d,t)\
206
    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
207
    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
208
    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
209
    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
210

    
211
/***********************************/
212
/* standard MMX */
213

    
214
#ifdef CONFIG_ENCODERS
215
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
216
{
217
    asm volatile(
218
        "mov $-128, %%"REG_a"           \n\t"
219
        "pxor %%mm7, %%mm7              \n\t"
220
        ASMALIGN(4)
221
        "1:                             \n\t"
222
        "movq (%0), %%mm0               \n\t"
223
        "movq (%0, %2), %%mm2           \n\t"
224
        "movq %%mm0, %%mm1              \n\t"
225
        "movq %%mm2, %%mm3              \n\t"
226
        "punpcklbw %%mm7, %%mm0         \n\t"
227
        "punpckhbw %%mm7, %%mm1         \n\t"
228
        "punpcklbw %%mm7, %%mm2         \n\t"
229
        "punpckhbw %%mm7, %%mm3         \n\t"
230
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
231
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
232
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
233
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
234
        "add %3, %0                     \n\t"
235
        "add $32, %%"REG_a"             \n\t"
236
        "js 1b                          \n\t"
237
        : "+r" (pixels)
238
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
239
        : "%"REG_a
240
    );
241
}
242

    
243
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
244
{
245
    asm volatile(
246
        "pxor %%mm7, %%mm7              \n\t"
247
        "mov $-128, %%"REG_a"           \n\t"
248
        ASMALIGN(4)
249
        "1:                             \n\t"
250
        "movq (%0), %%mm0               \n\t"
251
        "movq (%1), %%mm2               \n\t"
252
        "movq %%mm0, %%mm1              \n\t"
253
        "movq %%mm2, %%mm3              \n\t"
254
        "punpcklbw %%mm7, %%mm0         \n\t"
255
        "punpckhbw %%mm7, %%mm1         \n\t"
256
        "punpcklbw %%mm7, %%mm2         \n\t"
257
        "punpckhbw %%mm7, %%mm3         \n\t"
258
        "psubw %%mm2, %%mm0             \n\t"
259
        "psubw %%mm3, %%mm1             \n\t"
260
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
261
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
262
        "add %3, %0                     \n\t"
263
        "add %3, %1                     \n\t"
264
        "add $16, %%"REG_a"             \n\t"
265
        "jnz 1b                         \n\t"
266
        : "+r" (s1), "+r" (s2)
267
        : "r" (block+64), "r" ((long)stride)
268
        : "%"REG_a
269
    );
270
}
271
#endif //CONFIG_ENCODERS
272

    
273
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
274
{
275
    const DCTELEM *p;
276
    uint8_t *pix;
277

    
278
    /* read the pixels */
279
    p = block;
280
    pix = pixels;
281
    /* unrolled loop */
282
        __asm __volatile(
283
                "movq   %3, %%mm0               \n\t"
284
                "movq   8%3, %%mm1              \n\t"
285
                "movq   16%3, %%mm2             \n\t"
286
                "movq   24%3, %%mm3             \n\t"
287
                "movq   32%3, %%mm4             \n\t"
288
                "movq   40%3, %%mm5             \n\t"
289
                "movq   48%3, %%mm6             \n\t"
290
                "movq   56%3, %%mm7             \n\t"
291
                "packuswb %%mm1, %%mm0          \n\t"
292
                "packuswb %%mm3, %%mm2          \n\t"
293
                "packuswb %%mm5, %%mm4          \n\t"
294
                "packuswb %%mm7, %%mm6          \n\t"
295
                "movq   %%mm0, (%0)             \n\t"
296
                "movq   %%mm2, (%0, %1)         \n\t"
297
                "movq   %%mm4, (%0, %1, 2)      \n\t"
298
                "movq   %%mm6, (%0, %2)         \n\t"
299
                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
300
                :"memory");
301
        pix += line_size*4;
302
        p += 32;
303

    
304
    // if here would be an exact copy of the code above
305
    // compiler would generate some very strange code
306
    // thus using "r"
307
    __asm __volatile(
308
            "movq       (%3), %%mm0             \n\t"
309
            "movq       8(%3), %%mm1            \n\t"
310
            "movq       16(%3), %%mm2           \n\t"
311
            "movq       24(%3), %%mm3           \n\t"
312
            "movq       32(%3), %%mm4           \n\t"
313
            "movq       40(%3), %%mm5           \n\t"
314
            "movq       48(%3), %%mm6           \n\t"
315
            "movq       56(%3), %%mm7           \n\t"
316
            "packuswb %%mm1, %%mm0              \n\t"
317
            "packuswb %%mm3, %%mm2              \n\t"
318
            "packuswb %%mm5, %%mm4              \n\t"
319
            "packuswb %%mm7, %%mm6              \n\t"
320
            "movq       %%mm0, (%0)             \n\t"
321
            "movq       %%mm2, (%0, %1)         \n\t"
322
            "movq       %%mm4, (%0, %1, 2)      \n\t"
323
            "movq       %%mm6, (%0, %2)         \n\t"
324
            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
325
            :"memory");
326
}
327

    
328
static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
329
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
330

    
331
void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
332
{
333
    int i;
334

    
335
    movq_m2r(*vector128, mm1);
336
    for (i = 0; i < 8; i++) {
337
        movq_m2r(*(block), mm0);
338
        packsswb_m2r(*(block + 4), mm0);
339
        block += 8;
340
        paddb_r2r(mm1, mm0);
341
        movq_r2m(mm0, *pixels);
342
        pixels += line_size;
343
    }
344
}
345

    
346
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
347
{
348
    const DCTELEM *p;
349
    uint8_t *pix;
350
    int i;
351

    
352
    /* read the pixels */
353
    p = block;
354
    pix = pixels;
355
    MOVQ_ZERO(mm7);
356
    i = 4;
357
    do {
358
        __asm __volatile(
359
                "movq   (%2), %%mm0     \n\t"
360
                "movq   8(%2), %%mm1    \n\t"
361
                "movq   16(%2), %%mm2   \n\t"
362
                "movq   24(%2), %%mm3   \n\t"
363
                "movq   %0, %%mm4       \n\t"
364
                "movq   %1, %%mm6       \n\t"
365
                "movq   %%mm4, %%mm5    \n\t"
366
                "punpcklbw %%mm7, %%mm4 \n\t"
367
                "punpckhbw %%mm7, %%mm5 \n\t"
368
                "paddsw %%mm4, %%mm0    \n\t"
369
                "paddsw %%mm5, %%mm1    \n\t"
370
                "movq   %%mm6, %%mm5    \n\t"
371
                "punpcklbw %%mm7, %%mm6 \n\t"
372
                "punpckhbw %%mm7, %%mm5 \n\t"
373
                "paddsw %%mm6, %%mm2    \n\t"
374
                "paddsw %%mm5, %%mm3    \n\t"
375
                "packuswb %%mm1, %%mm0  \n\t"
376
                "packuswb %%mm3, %%mm2  \n\t"
377
                "movq   %%mm0, %0       \n\t"
378
                "movq   %%mm2, %1       \n\t"
379
                :"+m"(*pix), "+m"(*(pix+line_size))
380
                :"r"(p)
381
                :"memory");
382
        pix += line_size*2;
383
        p += 16;
384
    } while (--i);
385
}
386

    
387
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
388
{
389
    __asm __volatile(
390
         "lea (%3, %3), %%"REG_a"       \n\t"
391
         ASMALIGN(3)
392
         "1:                            \n\t"
393
         "movd (%1), %%mm0              \n\t"
394
         "movd (%1, %3), %%mm1          \n\t"
395
         "movd %%mm0, (%2)              \n\t"
396
         "movd %%mm1, (%2, %3)          \n\t"
397
         "add %%"REG_a", %1             \n\t"
398
         "add %%"REG_a", %2             \n\t"
399
         "movd (%1), %%mm0              \n\t"
400
         "movd (%1, %3), %%mm1          \n\t"
401
         "movd %%mm0, (%2)              \n\t"
402
         "movd %%mm1, (%2, %3)          \n\t"
403
         "add %%"REG_a", %1             \n\t"
404
         "add %%"REG_a", %2             \n\t"
405
         "subl $4, %0                   \n\t"
406
         "jnz 1b                        \n\t"
407
         : "+g"(h), "+r" (pixels),  "+r" (block)
408
         : "r"((long)line_size)
409
         : "%"REG_a, "memory"
410
        );
411
}
412

    
413
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
414
{
415
    __asm __volatile(
416
         "lea (%3, %3), %%"REG_a"       \n\t"
417
         ASMALIGN(3)
418
         "1:                            \n\t"
419
         "movq (%1), %%mm0              \n\t"
420
         "movq (%1, %3), %%mm1          \n\t"
421
         "movq %%mm0, (%2)              \n\t"
422
         "movq %%mm1, (%2, %3)          \n\t"
423
         "add %%"REG_a", %1             \n\t"
424
         "add %%"REG_a", %2             \n\t"
425
         "movq (%1), %%mm0              \n\t"
426
         "movq (%1, %3), %%mm1          \n\t"
427
         "movq %%mm0, (%2)              \n\t"
428
         "movq %%mm1, (%2, %3)          \n\t"
429
         "add %%"REG_a", %1             \n\t"
430
         "add %%"REG_a", %2             \n\t"
431
         "subl $4, %0                   \n\t"
432
         "jnz 1b                        \n\t"
433
         : "+g"(h), "+r" (pixels),  "+r" (block)
434
         : "r"((long)line_size)
435
         : "%"REG_a, "memory"
436
        );
437
}
438

    
439
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
440
{
441
    __asm __volatile(
442
         "lea (%3, %3), %%"REG_a"       \n\t"
443
         ASMALIGN(3)
444
         "1:                            \n\t"
445
         "movq (%1), %%mm0              \n\t"
446
         "movq 8(%1), %%mm4             \n\t"
447
         "movq (%1, %3), %%mm1          \n\t"
448
         "movq 8(%1, %3), %%mm5         \n\t"
449
         "movq %%mm0, (%2)              \n\t"
450
         "movq %%mm4, 8(%2)             \n\t"
451
         "movq %%mm1, (%2, %3)          \n\t"
452
         "movq %%mm5, 8(%2, %3)         \n\t"
453
         "add %%"REG_a", %1             \n\t"
454
         "add %%"REG_a", %2             \n\t"
455
         "movq (%1), %%mm0              \n\t"
456
         "movq 8(%1), %%mm4             \n\t"
457
         "movq (%1, %3), %%mm1          \n\t"
458
         "movq 8(%1, %3), %%mm5         \n\t"
459
         "movq %%mm0, (%2)              \n\t"
460
         "movq %%mm4, 8(%2)             \n\t"
461
         "movq %%mm1, (%2, %3)          \n\t"
462
         "movq %%mm5, 8(%2, %3)         \n\t"
463
         "add %%"REG_a", %1             \n\t"
464
         "add %%"REG_a", %2             \n\t"
465
         "subl $4, %0                   \n\t"
466
         "jnz 1b                        \n\t"
467
         : "+g"(h), "+r" (pixels),  "+r" (block)
468
         : "r"((long)line_size)
469
         : "%"REG_a, "memory"
470
        );
471
}
472

    
473
static void clear_blocks_mmx(DCTELEM *blocks)
474
{
475
    __asm __volatile(
476
                "pxor %%mm7, %%mm7              \n\t"
477
                "mov $-128*6, %%"REG_a"         \n\t"
478
                "1:                             \n\t"
479
                "movq %%mm7, (%0, %%"REG_a")    \n\t"
480
                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
481
                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
482
                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
483
                "add $32, %%"REG_a"             \n\t"
484
                " js 1b                         \n\t"
485
                : : "r" (((uint8_t *)blocks)+128*6)
486
                : "%"REG_a
487
        );
488
}
489

    
490
#ifdef CONFIG_ENCODERS
491
static int pix_sum16_mmx(uint8_t * pix, int line_size){
492
    const int h=16;
493
    int sum;
494
    long index= -line_size*h;
495

    
496
    __asm __volatile(
497
                "pxor %%mm7, %%mm7              \n\t"
498
                "pxor %%mm6, %%mm6              \n\t"
499
                "1:                             \n\t"
500
                "movq (%2, %1), %%mm0           \n\t"
501
                "movq (%2, %1), %%mm1           \n\t"
502
                "movq 8(%2, %1), %%mm2          \n\t"
503
                "movq 8(%2, %1), %%mm3          \n\t"
504
                "punpcklbw %%mm7, %%mm0         \n\t"
505
                "punpckhbw %%mm7, %%mm1         \n\t"
506
                "punpcklbw %%mm7, %%mm2         \n\t"
507
                "punpckhbw %%mm7, %%mm3         \n\t"
508
                "paddw %%mm0, %%mm1             \n\t"
509
                "paddw %%mm2, %%mm3             \n\t"
510
                "paddw %%mm1, %%mm3             \n\t"
511
                "paddw %%mm3, %%mm6             \n\t"
512
                "add %3, %1                     \n\t"
513
                " js 1b                         \n\t"
514
                "movq %%mm6, %%mm5              \n\t"
515
                "psrlq $32, %%mm6               \n\t"
516
                "paddw %%mm5, %%mm6             \n\t"
517
                "movq %%mm6, %%mm5              \n\t"
518
                "psrlq $16, %%mm6               \n\t"
519
                "paddw %%mm5, %%mm6             \n\t"
520
                "movd %%mm6, %0                 \n\t"
521
                "andl $0xFFFF, %0               \n\t"
522
                : "=&r" (sum), "+r" (index)
523
                : "r" (pix - index), "r" ((long)line_size)
524
        );
525

    
526
        return sum;
527
}
528
#endif //CONFIG_ENCODERS
529

    
530
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
531
    long i=0;
532
    asm volatile(
533
        "1:                             \n\t"
534
        "movq  (%1, %0), %%mm0          \n\t"
535
        "movq  (%2, %0), %%mm1          \n\t"
536
        "paddb %%mm0, %%mm1             \n\t"
537
        "movq %%mm1, (%2, %0)           \n\t"
538
        "movq 8(%1, %0), %%mm0          \n\t"
539
        "movq 8(%2, %0), %%mm1          \n\t"
540
        "paddb %%mm0, %%mm1             \n\t"
541
        "movq %%mm1, 8(%2, %0)          \n\t"
542
        "add $16, %0                    \n\t"
543
        "cmp %3, %0                     \n\t"
544
        " jb 1b                         \n\t"
545
        : "+r" (i)
546
        : "r"(src), "r"(dst), "r"((long)w-15)
547
    );
548
    for(; i<w; i++)
549
        dst[i+0] += src[i+0];
550
}
551

    
552
#define H263_LOOP_FILTER \
553
        "pxor %%mm7, %%mm7              \n\t"\
554
        "movq  %0, %%mm0                \n\t"\
555
        "movq  %0, %%mm1                \n\t"\
556
        "movq  %3, %%mm2                \n\t"\
557
        "movq  %3, %%mm3                \n\t"\
558
        "punpcklbw %%mm7, %%mm0         \n\t"\
559
        "punpckhbw %%mm7, %%mm1         \n\t"\
560
        "punpcklbw %%mm7, %%mm2         \n\t"\
561
        "punpckhbw %%mm7, %%mm3         \n\t"\
562
        "psubw %%mm2, %%mm0             \n\t"\
563
        "psubw %%mm3, %%mm1             \n\t"\
564
        "movq  %1, %%mm2                \n\t"\
565
        "movq  %1, %%mm3                \n\t"\
566
        "movq  %2, %%mm4                \n\t"\
567
        "movq  %2, %%mm5                \n\t"\
568
        "punpcklbw %%mm7, %%mm2         \n\t"\
569
        "punpckhbw %%mm7, %%mm3         \n\t"\
570
        "punpcklbw %%mm7, %%mm4         \n\t"\
571
        "punpckhbw %%mm7, %%mm5         \n\t"\
572
        "psubw %%mm2, %%mm4             \n\t"\
573
        "psubw %%mm3, %%mm5             \n\t"\
574
        "psllw $2, %%mm4                \n\t"\
575
        "psllw $2, %%mm5                \n\t"\
576
        "paddw %%mm0, %%mm4             \n\t"\
577
        "paddw %%mm1, %%mm5             \n\t"\
578
        "pxor %%mm6, %%mm6              \n\t"\
579
        "pcmpgtw %%mm4, %%mm6           \n\t"\
580
        "pcmpgtw %%mm5, %%mm7           \n\t"\
581
        "pxor %%mm6, %%mm4              \n\t"\
582
        "pxor %%mm7, %%mm5              \n\t"\
583
        "psubw %%mm6, %%mm4             \n\t"\
584
        "psubw %%mm7, %%mm5             \n\t"\
585
        "psrlw $3, %%mm4                \n\t"\
586
        "psrlw $3, %%mm5                \n\t"\
587
        "packuswb %%mm5, %%mm4          \n\t"\
588
        "packsswb %%mm7, %%mm6          \n\t"\
589
        "pxor %%mm7, %%mm7              \n\t"\
590
        "movd %4, %%mm2                 \n\t"\
591
        "punpcklbw %%mm2, %%mm2         \n\t"\
592
        "punpcklbw %%mm2, %%mm2         \n\t"\
593
        "punpcklbw %%mm2, %%mm2         \n\t"\
594
        "psubusb %%mm4, %%mm2           \n\t"\
595
        "movq %%mm2, %%mm3              \n\t"\
596
        "psubusb %%mm4, %%mm3           \n\t"\
597
        "psubb %%mm3, %%mm2             \n\t"\
598
        "movq %1, %%mm3                 \n\t"\
599
        "movq %2, %%mm4                 \n\t"\
600
        "pxor %%mm6, %%mm3              \n\t"\
601
        "pxor %%mm6, %%mm4              \n\t"\
602
        "paddusb %%mm2, %%mm3           \n\t"\
603
        "psubusb %%mm2, %%mm4           \n\t"\
604
        "pxor %%mm6, %%mm3              \n\t"\
605
        "pxor %%mm6, %%mm4              \n\t"\
606
        "paddusb %%mm2, %%mm2           \n\t"\
607
        "packsswb %%mm1, %%mm0          \n\t"\
608
        "pcmpgtb %%mm0, %%mm7           \n\t"\
609
        "pxor %%mm7, %%mm0              \n\t"\
610
        "psubb %%mm7, %%mm0             \n\t"\
611
        "movq %%mm0, %%mm1              \n\t"\
612
        "psubusb %%mm2, %%mm0           \n\t"\
613
        "psubb %%mm0, %%mm1             \n\t"\
614
        "pand %5, %%mm1                 \n\t"\
615
        "psrlw $2, %%mm1                \n\t"\
616
        "pxor %%mm7, %%mm1              \n\t"\
617
        "psubb %%mm7, %%mm1             \n\t"\
618
        "movq %0, %%mm5                 \n\t"\
619
        "movq %3, %%mm6                 \n\t"\
620
        "psubb %%mm1, %%mm5             \n\t"\
621
        "paddb %%mm1, %%mm6             \n\t"
622

    
623
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
624
    const int strength= ff_h263_loop_filter_strength[qscale];
625

    
626
    asm volatile(
627

    
628
        H263_LOOP_FILTER
629

    
630
        "movq %%mm3, %1                 \n\t"
631
        "movq %%mm4, %2                 \n\t"
632
        "movq %%mm5, %0                 \n\t"
633
        "movq %%mm6, %3                 \n\t"
634
        : "+m" (*(uint64_t*)(src - 2*stride)),
635
          "+m" (*(uint64_t*)(src - 1*stride)),
636
          "+m" (*(uint64_t*)(src + 0*stride)),
637
          "+m" (*(uint64_t*)(src + 1*stride))
638
        : "g" (2*strength), "m"(ff_pb_FC)
639
    );
640
}
641

    
642
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
643
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
644
        "movd  %4, %%mm0                \n\t"
645
        "movd  %5, %%mm1                \n\t"
646
        "movd  %6, %%mm2                \n\t"
647
        "movd  %7, %%mm3                \n\t"
648
        "punpcklbw %%mm1, %%mm0         \n\t"
649
        "punpcklbw %%mm3, %%mm2         \n\t"
650
        "movq %%mm0, %%mm1              \n\t"
651
        "punpcklwd %%mm2, %%mm0         \n\t"
652
        "punpckhwd %%mm2, %%mm1         \n\t"
653
        "movd  %%mm0, %0                \n\t"
654
        "punpckhdq %%mm0, %%mm0         \n\t"
655
        "movd  %%mm0, %1                \n\t"
656
        "movd  %%mm1, %2                \n\t"
657
        "punpckhdq %%mm1, %%mm1         \n\t"
658
        "movd  %%mm1, %3                \n\t"
659

    
660
        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
661
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
662
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
663
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
664
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
665
           "m" (*(uint32_t*)(src + 1*src_stride)),
666
           "m" (*(uint32_t*)(src + 2*src_stride)),
667
           "m" (*(uint32_t*)(src + 3*src_stride))
668
    );
669
}
670

    
671
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
672
    const int strength= ff_h263_loop_filter_strength[qscale];
673
    uint64_t temp[4] __attribute__ ((aligned(8)));
674
    uint8_t *btemp= (uint8_t*)temp;
675

    
676
    src -= 2;
677

    
678
    transpose4x4(btemp  , src           , 8, stride);
679
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
680
    asm volatile(
681
        H263_LOOP_FILTER // 5 3 4 6
682

    
683
        : "+m" (temp[0]),
684
          "+m" (temp[1]),
685
          "+m" (temp[2]),
686
          "+m" (temp[3])
687
        : "g" (2*strength), "m"(ff_pb_FC)
688
    );
689

    
690
    asm volatile(
691
        "movq %%mm5, %%mm1              \n\t"
692
        "movq %%mm4, %%mm0              \n\t"
693
        "punpcklbw %%mm3, %%mm5         \n\t"
694
        "punpcklbw %%mm6, %%mm4         \n\t"
695
        "punpckhbw %%mm3, %%mm1         \n\t"
696
        "punpckhbw %%mm6, %%mm0         \n\t"
697
        "movq %%mm5, %%mm3              \n\t"
698
        "movq %%mm1, %%mm6              \n\t"
699
        "punpcklwd %%mm4, %%mm5         \n\t"
700
        "punpcklwd %%mm0, %%mm1         \n\t"
701
        "punpckhwd %%mm4, %%mm3         \n\t"
702
        "punpckhwd %%mm0, %%mm6         \n\t"
703
        "movd %%mm5, (%0)               \n\t"
704
        "punpckhdq %%mm5, %%mm5         \n\t"
705
        "movd %%mm5, (%0,%2)            \n\t"
706
        "movd %%mm3, (%0,%2,2)          \n\t"
707
        "punpckhdq %%mm3, %%mm3         \n\t"
708
        "movd %%mm3, (%0,%3)            \n\t"
709
        "movd %%mm1, (%1)               \n\t"
710
        "punpckhdq %%mm1, %%mm1         \n\t"
711
        "movd %%mm1, (%1,%2)            \n\t"
712
        "movd %%mm6, (%1,%2,2)          \n\t"
713
        "punpckhdq %%mm6, %%mm6         \n\t"
714
        "movd %%mm6, (%1,%3)            \n\t"
715
        :: "r" (src),
716
           "r" (src + 4*stride),
717
           "r" ((long)   stride ),
718
           "r" ((long)(3*stride))
719
    );
720
}
721

    
722
#ifdef CONFIG_ENCODERS
723
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
724
    int tmp;
725
  asm volatile (
726
      "movl $16,%%ecx\n"
727
      "pxor %%mm0,%%mm0\n"
728
      "pxor %%mm7,%%mm7\n"
729
      "1:\n"
730
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
731
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */
732

    
733
      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */
734

    
735
      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
736
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
737

    
738
      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
739
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
740
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
741

    
742
      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
743
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
744

    
745
      "pmaddwd %%mm3,%%mm3\n"
746
      "pmaddwd %%mm4,%%mm4\n"
747

    
748
      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
749
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
750
      "paddd %%mm3,%%mm4\n"
751
      "paddd %%mm2,%%mm7\n"
752

    
753
      "add %2, %0\n"
754
      "paddd %%mm4,%%mm7\n"
755
      "dec %%ecx\n"
756
      "jnz 1b\n"
757

    
758
      "movq %%mm7,%%mm1\n"
759
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
760
      "paddd %%mm7,%%mm1\n"
761
      "movd %%mm1,%1\n"
762
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
763
    return tmp;
764
}
765

    
766
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
767
    int tmp;
768
  asm volatile (
769
      "movl %4,%%ecx\n"
770
      "shr $1,%%ecx\n"
771
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
772
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
773
      "1:\n"
774
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
775
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
776
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
777
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */
778

    
779
      /* todo: mm1-mm2, mm3-mm4 */
780
      /* algo: substract mm1 from mm2 with saturation and vice versa */
781
      /*       OR the results to get absolute difference */
782
      "movq %%mm1,%%mm5\n"
783
      "movq %%mm3,%%mm6\n"
784
      "psubusb %%mm2,%%mm1\n"
785
      "psubusb %%mm4,%%mm3\n"
786
      "psubusb %%mm5,%%mm2\n"
787
      "psubusb %%mm6,%%mm4\n"
788

    
789
      "por %%mm1,%%mm2\n"
790
      "por %%mm3,%%mm4\n"
791

    
792
      /* now convert to 16-bit vectors so we can square them */
793
      "movq %%mm2,%%mm1\n"
794
      "movq %%mm4,%%mm3\n"
795

    
796
      "punpckhbw %%mm0,%%mm2\n"
797
      "punpckhbw %%mm0,%%mm4\n"
798
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
799
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
800

    
801
      "pmaddwd %%mm2,%%mm2\n"
802
      "pmaddwd %%mm4,%%mm4\n"
803
      "pmaddwd %%mm1,%%mm1\n"
804
      "pmaddwd %%mm3,%%mm3\n"
805

    
806
      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
807
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */
808

    
809
      "paddd %%mm2,%%mm1\n"
810
      "paddd %%mm4,%%mm3\n"
811
      "paddd %%mm1,%%mm7\n"
812
      "paddd %%mm3,%%mm7\n"
813

    
814
      "decl %%ecx\n"
815
      "jnz 1b\n"
816

    
817
      "movq %%mm7,%%mm1\n"
818
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
819
      "paddd %%mm7,%%mm1\n"
820
      "movd %%mm1,%2\n"
821
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
822
      : "r" ((long)line_size) , "m" (h)
823
      : "%ecx");
824
    return tmp;
825
}
826

    
827
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
828
    int tmp;
829
  asm volatile (
830
      "movl %4,%%ecx\n"
831
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
832
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
833
      "1:\n"
834
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
835
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
836
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
837
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */
838

    
839
      /* todo: mm1-mm2, mm3-mm4 */
840
      /* algo: substract mm1 from mm2 with saturation and vice versa */
841
      /*       OR the results to get absolute difference */
842
      "movq %%mm1,%%mm5\n"
843
      "movq %%mm3,%%mm6\n"
844
      "psubusb %%mm2,%%mm1\n"
845
      "psubusb %%mm4,%%mm3\n"
846
      "psubusb %%mm5,%%mm2\n"
847
      "psubusb %%mm6,%%mm4\n"
848

    
849
      "por %%mm1,%%mm2\n"
850
      "por %%mm3,%%mm4\n"
851

    
852
      /* now convert to 16-bit vectors so we can square them */
853
      "movq %%mm2,%%mm1\n"
854
      "movq %%mm4,%%mm3\n"
855

    
856
      "punpckhbw %%mm0,%%mm2\n"
857
      "punpckhbw %%mm0,%%mm4\n"
858
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
859
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
860

    
861
      "pmaddwd %%mm2,%%mm2\n"
862
      "pmaddwd %%mm4,%%mm4\n"
863
      "pmaddwd %%mm1,%%mm1\n"
864
      "pmaddwd %%mm3,%%mm3\n"
865

    
866
      "add %3,%0\n"
867
      "add %3,%1\n"
868

    
869
      "paddd %%mm2,%%mm1\n"
870
      "paddd %%mm4,%%mm3\n"
871
      "paddd %%mm1,%%mm7\n"
872
      "paddd %%mm3,%%mm7\n"
873

    
874
      "decl %%ecx\n"
875
      "jnz 1b\n"
876

    
877
      "movq %%mm7,%%mm1\n"
878
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
879
      "paddd %%mm7,%%mm1\n"
880
      "movd %%mm1,%2\n"
881
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
882
      : "r" ((long)line_size) , "m" (h)
883
      : "%ecx");
884
    return tmp;
885
}
886

    
887
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
888
    int tmp;
889
  asm volatile (
890
      "shr $1,%2\n"
891
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
892
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
893
      "1:\n"
894
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
895
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
896
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
897
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
898

    
899
      /* todo: mm1-mm2, mm3-mm4 */
900
      /* algo: substract mm1 from mm2 with saturation and vice versa */
901
      /*       OR the results to get absolute difference */
902
      "movdqa %%xmm1,%%xmm5\n"
903
      "movdqa %%xmm3,%%xmm6\n"
904
      "psubusb %%xmm2,%%xmm1\n"
905
      "psubusb %%xmm4,%%xmm3\n"
906
      "psubusb %%xmm5,%%xmm2\n"
907
      "psubusb %%xmm6,%%xmm4\n"
908

    
909
      "por %%xmm1,%%xmm2\n"
910
      "por %%xmm3,%%xmm4\n"
911

    
912
      /* now convert to 16-bit vectors so we can square them */
913
      "movdqa %%xmm2,%%xmm1\n"
914
      "movdqa %%xmm4,%%xmm3\n"
915

    
916
      "punpckhbw %%xmm0,%%xmm2\n"
917
      "punpckhbw %%xmm0,%%xmm4\n"
918
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
919
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */
920

    
921
      "pmaddwd %%xmm2,%%xmm2\n"
922
      "pmaddwd %%xmm4,%%xmm4\n"
923
      "pmaddwd %%xmm1,%%xmm1\n"
924
      "pmaddwd %%xmm3,%%xmm3\n"
925

    
926
      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
927
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */
928

    
929
      "paddd %%xmm2,%%xmm1\n"
930
      "paddd %%xmm4,%%xmm3\n"
931
      "paddd %%xmm1,%%xmm7\n"
932
      "paddd %%xmm3,%%xmm7\n"
933

    
934
      "decl %2\n"
935
      "jnz 1b\n"
936

    
937
      "movdqa %%xmm7,%%xmm1\n"
938
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
939
      "paddd %%xmm1,%%xmm7\n"
940
      "movdqa %%xmm7,%%xmm1\n"
941
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
942
      "paddd %%xmm1,%%xmm7\n"
943
      "movd %%xmm7,%3\n"
944
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
945
      : "r" ((long)line_size));
946
    return tmp;
947
}
948

    
949
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
950
    int tmp;
951
  asm volatile (
952
      "movl %3,%%ecx\n"
953
      "pxor %%mm7,%%mm7\n"
954
      "pxor %%mm6,%%mm6\n"
955

    
956
      "movq (%0),%%mm0\n"
957
      "movq %%mm0, %%mm1\n"
958
      "psllq $8, %%mm0\n"
959
      "psrlq $8, %%mm1\n"
960
      "psrlq $8, %%mm0\n"
961
      "movq %%mm0, %%mm2\n"
962
      "movq %%mm1, %%mm3\n"
963
      "punpcklbw %%mm7,%%mm0\n"
964
      "punpcklbw %%mm7,%%mm1\n"
965
      "punpckhbw %%mm7,%%mm2\n"
966
      "punpckhbw %%mm7,%%mm3\n"
967
      "psubw %%mm1, %%mm0\n"
968
      "psubw %%mm3, %%mm2\n"
969

    
970
      "add %2,%0\n"
971

    
972
      "movq (%0),%%mm4\n"
973
      "movq %%mm4, %%mm1\n"
974
      "psllq $8, %%mm4\n"
975
      "psrlq $8, %%mm1\n"
976
      "psrlq $8, %%mm4\n"
977
      "movq %%mm4, %%mm5\n"
978
      "movq %%mm1, %%mm3\n"
979
      "punpcklbw %%mm7,%%mm4\n"
980
      "punpcklbw %%mm7,%%mm1\n"
981
      "punpckhbw %%mm7,%%mm5\n"
982
      "punpckhbw %%mm7,%%mm3\n"
983
      "psubw %%mm1, %%mm4\n"
984
      "psubw %%mm3, %%mm5\n"
985
      "psubw %%mm4, %%mm0\n"
986
      "psubw %%mm5, %%mm2\n"
987
      "pxor %%mm3, %%mm3\n"
988
      "pxor %%mm1, %%mm1\n"
989
      "pcmpgtw %%mm0, %%mm3\n\t"
990
      "pcmpgtw %%mm2, %%mm1\n\t"
991
      "pxor %%mm3, %%mm0\n"
992
      "pxor %%mm1, %%mm2\n"
993
      "psubw %%mm3, %%mm0\n"
994
      "psubw %%mm1, %%mm2\n"
995
      "paddw %%mm0, %%mm2\n"
996
      "paddw %%mm2, %%mm6\n"
997

    
998
      "add %2,%0\n"
999
      "1:\n"
1000

    
1001
      "movq (%0),%%mm0\n"
1002
      "movq %%mm0, %%mm1\n"
1003
      "psllq $8, %%mm0\n"
1004
      "psrlq $8, %%mm1\n"
1005
      "psrlq $8, %%mm0\n"
1006
      "movq %%mm0, %%mm2\n"
1007
      "movq %%mm1, %%mm3\n"
1008
      "punpcklbw %%mm7,%%mm0\n"
1009
      "punpcklbw %%mm7,%%mm1\n"
1010
      "punpckhbw %%mm7,%%mm2\n"
1011
      "punpckhbw %%mm7,%%mm3\n"
1012
      "psubw %%mm1, %%mm0\n"
1013
      "psubw %%mm3, %%mm2\n"
1014
      "psubw %%mm0, %%mm4\n"
1015
      "psubw %%mm2, %%mm5\n"
1016
      "pxor %%mm3, %%mm3\n"
1017
      "pxor %%mm1, %%mm1\n"
1018
      "pcmpgtw %%mm4, %%mm3\n\t"
1019
      "pcmpgtw %%mm5, %%mm1\n\t"
1020
      "pxor %%mm3, %%mm4\n"
1021
      "pxor %%mm1, %%mm5\n"
1022
      "psubw %%mm3, %%mm4\n"
1023
      "psubw %%mm1, %%mm5\n"
1024
      "paddw %%mm4, %%mm5\n"
1025
      "paddw %%mm5, %%mm6\n"
1026

    
1027
      "add %2,%0\n"
1028

    
1029
      "movq (%0),%%mm4\n"
1030
      "movq %%mm4, %%mm1\n"
1031
      "psllq $8, %%mm4\n"
1032
      "psrlq $8, %%mm1\n"
1033
      "psrlq $8, %%mm4\n"
1034
      "movq %%mm4, %%mm5\n"
1035
      "movq %%mm1, %%mm3\n"
1036
      "punpcklbw %%mm7,%%mm4\n"
1037
      "punpcklbw %%mm7,%%mm1\n"
1038
      "punpckhbw %%mm7,%%mm5\n"
1039
      "punpckhbw %%mm7,%%mm3\n"
1040
      "psubw %%mm1, %%mm4\n"
1041
      "psubw %%mm3, %%mm5\n"
1042
      "psubw %%mm4, %%mm0\n"
1043
      "psubw %%mm5, %%mm2\n"
1044
      "pxor %%mm3, %%mm3\n"
1045
      "pxor %%mm1, %%mm1\n"
1046
      "pcmpgtw %%mm0, %%mm3\n\t"
1047
      "pcmpgtw %%mm2, %%mm1\n\t"
1048
      "pxor %%mm3, %%mm0\n"
1049
      "pxor %%mm1, %%mm2\n"
1050
      "psubw %%mm3, %%mm0\n"
1051
      "psubw %%mm1, %%mm2\n"
1052
      "paddw %%mm0, %%mm2\n"
1053
      "paddw %%mm2, %%mm6\n"
1054

    
1055
      "add %2,%0\n"
1056
      "subl $2, %%ecx\n"
1057
      " jnz 1b\n"
1058

    
1059
      "movq %%mm6, %%mm0\n"
1060
      "punpcklwd %%mm7,%%mm0\n"
1061
      "punpckhwd %%mm7,%%mm6\n"
1062
      "paddd %%mm0, %%mm6\n"
1063

    
1064
      "movq %%mm6,%%mm0\n"
1065
      "psrlq $32, %%mm6\n"
1066
      "paddd %%mm6,%%mm0\n"
1067
      "movd %%mm0,%1\n"
1068
      : "+r" (pix1), "=r"(tmp)
1069
      : "r" ((long)line_size) , "g" (h-2)
1070
      : "%ecx");
1071
      return tmp;
1072
}
1073

    
1074
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1075
    int tmp;
1076
    uint8_t * pix= pix1;
1077
  asm volatile (
1078
      "movl %3,%%ecx\n"
1079
      "pxor %%mm7,%%mm7\n"
1080
      "pxor %%mm6,%%mm6\n"
1081

    
1082
      "movq (%0),%%mm0\n"
1083
      "movq 1(%0),%%mm1\n"
1084
      "movq %%mm0, %%mm2\n"
1085
      "movq %%mm1, %%mm3\n"
1086
      "punpcklbw %%mm7,%%mm0\n"
1087
      "punpcklbw %%mm7,%%mm1\n"
1088
      "punpckhbw %%mm7,%%mm2\n"
1089
      "punpckhbw %%mm7,%%mm3\n"
1090
      "psubw %%mm1, %%mm0\n"
1091
      "psubw %%mm3, %%mm2\n"
1092

    
1093
      "add %2,%0\n"
1094

    
1095
      "movq (%0),%%mm4\n"
1096
      "movq 1(%0),%%mm1\n"
1097
      "movq %%mm4, %%mm5\n"
1098
      "movq %%mm1, %%mm3\n"
1099
      "punpcklbw %%mm7,%%mm4\n"
1100
      "punpcklbw %%mm7,%%mm1\n"
1101
      "punpckhbw %%mm7,%%mm5\n"
1102
      "punpckhbw %%mm7,%%mm3\n"
1103
      "psubw %%mm1, %%mm4\n"
1104
      "psubw %%mm3, %%mm5\n"
1105
      "psubw %%mm4, %%mm0\n"
1106
      "psubw %%mm5, %%mm2\n"
1107
      "pxor %%mm3, %%mm3\n"
1108
      "pxor %%mm1, %%mm1\n"
1109
      "pcmpgtw %%mm0, %%mm3\n\t"
1110
      "pcmpgtw %%mm2, %%mm1\n\t"
1111
      "pxor %%mm3, %%mm0\n"
1112
      "pxor %%mm1, %%mm2\n"
1113
      "psubw %%mm3, %%mm0\n"
1114
      "psubw %%mm1, %%mm2\n"
1115
      "paddw %%mm0, %%mm2\n"
1116
      "paddw %%mm2, %%mm6\n"
1117

    
1118
      "add %2,%0\n"
1119
      "1:\n"
1120

    
1121
      "movq (%0),%%mm0\n"
1122
      "movq 1(%0),%%mm1\n"
1123
      "movq %%mm0, %%mm2\n"
1124
      "movq %%mm1, %%mm3\n"
1125
      "punpcklbw %%mm7,%%mm0\n"
1126
      "punpcklbw %%mm7,%%mm1\n"
1127
      "punpckhbw %%mm7,%%mm2\n"
1128
      "punpckhbw %%mm7,%%mm3\n"
1129
      "psubw %%mm1, %%mm0\n"
1130
      "psubw %%mm3, %%mm2\n"
1131
      "psubw %%mm0, %%mm4\n"
1132
      "psubw %%mm2, %%mm5\n"
1133
      "pxor %%mm3, %%mm3\n"
1134
      "pxor %%mm1, %%mm1\n"
1135
      "pcmpgtw %%mm4, %%mm3\n\t"
1136
      "pcmpgtw %%mm5, %%mm1\n\t"
1137
      "pxor %%mm3, %%mm4\n"
1138
      "pxor %%mm1, %%mm5\n"
1139
      "psubw %%mm3, %%mm4\n"
1140
      "psubw %%mm1, %%mm5\n"
1141
      "paddw %%mm4, %%mm5\n"
1142
      "paddw %%mm5, %%mm6\n"
1143

    
1144
      "add %2,%0\n"
1145

    
1146
      "movq (%0),%%mm4\n"
1147
      "movq 1(%0),%%mm1\n"
1148
      "movq %%mm4, %%mm5\n"
1149
      "movq %%mm1, %%mm3\n"
1150
      "punpcklbw %%mm7,%%mm4\n"
1151
      "punpcklbw %%mm7,%%mm1\n"
1152
      "punpckhbw %%mm7,%%mm5\n"
1153
      "punpckhbw %%mm7,%%mm3\n"
1154
      "psubw %%mm1, %%mm4\n"
1155
      "psubw %%mm3, %%mm5\n"
1156
      "psubw %%mm4, %%mm0\n"
1157
      "psubw %%mm5, %%mm2\n"
1158
      "pxor %%mm3, %%mm3\n"
1159
      "pxor %%mm1, %%mm1\n"
1160
      "pcmpgtw %%mm0, %%mm3\n\t"
1161
      "pcmpgtw %%mm2, %%mm1\n\t"
1162
      "pxor %%mm3, %%mm0\n"
1163
      "pxor %%mm1, %%mm2\n"
1164
      "psubw %%mm3, %%mm0\n"
1165
      "psubw %%mm1, %%mm2\n"
1166
      "paddw %%mm0, %%mm2\n"
1167
      "paddw %%mm2, %%mm6\n"
1168

    
1169
      "add %2,%0\n"
1170
      "subl $2, %%ecx\n"
1171
      " jnz 1b\n"
1172

    
1173
      "movq %%mm6, %%mm0\n"
1174
      "punpcklwd %%mm7,%%mm0\n"
1175
      "punpckhwd %%mm7,%%mm6\n"
1176
      "paddd %%mm0, %%mm6\n"
1177

    
1178
      "movq %%mm6,%%mm0\n"
1179
      "psrlq $32, %%mm6\n"
1180
      "paddd %%mm6,%%mm0\n"
1181
      "movd %%mm0,%1\n"
1182
      : "+r" (pix1), "=r"(tmp)
1183
      : "r" ((long)line_size) , "g" (h-2)
1184
      : "%ecx");
1185
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
1186
}
1187

    
1188
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1189
    MpegEncContext *c = p;
1190
    int score1, score2;
1191

    
1192
    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1193
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1194
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1195

    
1196
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1197
    else  return score1 + FFABS(score2)*8;
1198
}
1199

    
1200
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1201
    MpegEncContext *c = p;
1202
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1203
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1204

    
1205
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1206
    else  return score1 + FFABS(score2)*8;
1207
}
1208

    
1209
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1210
    int tmp;
1211

    
1212
    assert( (((int)pix) & 7) == 0);
1213
    assert((line_size &7) ==0);
1214

    
1215
#define SUM(in0, in1, out0, out1) \
1216
      "movq (%0), %%mm2\n"\
1217
      "movq 8(%0), %%mm3\n"\
1218
      "add %2,%0\n"\
1219
      "movq %%mm2, " #out0 "\n"\
1220
      "movq %%mm3, " #out1 "\n"\
1221
      "psubusb " #in0 ", %%mm2\n"\
1222
      "psubusb " #in1 ", %%mm3\n"\
1223
      "psubusb " #out0 ", " #in0 "\n"\
1224
      "psubusb " #out1 ", " #in1 "\n"\
1225
      "por %%mm2, " #in0 "\n"\
1226
      "por %%mm3, " #in1 "\n"\
1227
      "movq " #in0 ", %%mm2\n"\
1228
      "movq " #in1 ", %%mm3\n"\
1229
      "punpcklbw %%mm7, " #in0 "\n"\
1230
      "punpcklbw %%mm7, " #in1 "\n"\
1231
      "punpckhbw %%mm7, %%mm2\n"\
1232
      "punpckhbw %%mm7, %%mm3\n"\
1233
      "paddw " #in1 ", " #in0 "\n"\
1234
      "paddw %%mm3, %%mm2\n"\
1235
      "paddw %%mm2, " #in0 "\n"\
1236
      "paddw " #in0 ", %%mm6\n"
1237

    
1238

    
1239
  asm volatile (
1240
      "movl %3,%%ecx\n"
1241
      "pxor %%mm6,%%mm6\n"
1242
      "pxor %%mm7,%%mm7\n"
1243
      "movq (%0),%%mm0\n"
1244
      "movq 8(%0),%%mm1\n"
1245
      "add %2,%0\n"
1246
      "subl $2, %%ecx\n"
1247
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1248
      "1:\n"
1249

    
1250
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1251

    
1252
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1253

    
1254
      "subl $2, %%ecx\n"
1255
      "jnz 1b\n"
1256

    
1257
      "movq %%mm6,%%mm0\n"
1258
      "psrlq $32, %%mm6\n"
1259
      "paddw %%mm6,%%mm0\n"
1260
      "movq %%mm0,%%mm6\n"
1261
      "psrlq $16, %%mm0\n"
1262
      "paddw %%mm6,%%mm0\n"
1263
      "movd %%mm0,%1\n"
1264
      : "+r" (pix), "=r"(tmp)
1265
      : "r" ((long)line_size) , "m" (h)
1266
      : "%ecx");
1267
    return tmp & 0xFFFF;
1268
}
1269
#undef SUM
1270

    
1271
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1272
    int tmp;
1273

    
1274
    assert( (((int)pix) & 7) == 0);
1275
    assert((line_size &7) ==0);
1276

    
1277
#define SUM(in0, in1, out0, out1) \
1278
      "movq (%0), " #out0 "\n"\
1279
      "movq 8(%0), " #out1 "\n"\
1280
      "add %2,%0\n"\
1281
      "psadbw " #out0 ", " #in0 "\n"\
1282
      "psadbw " #out1 ", " #in1 "\n"\
1283
      "paddw " #in1 ", " #in0 "\n"\
1284
      "paddw " #in0 ", %%mm6\n"
1285

    
1286
  asm volatile (
1287
      "movl %3,%%ecx\n"
1288
      "pxor %%mm6,%%mm6\n"
1289
      "pxor %%mm7,%%mm7\n"
1290
      "movq (%0),%%mm0\n"
1291
      "movq 8(%0),%%mm1\n"
1292
      "add %2,%0\n"
1293
      "subl $2, %%ecx\n"
1294
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1295
      "1:\n"
1296

    
1297
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1298

    
1299
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1300

    
1301
      "subl $2, %%ecx\n"
1302
      "jnz 1b\n"
1303

    
1304
      "movd %%mm6,%1\n"
1305
      : "+r" (pix), "=r"(tmp)
1306
      : "r" ((long)line_size) , "m" (h)
1307
      : "%ecx");
1308
    return tmp;
1309
}
1310
#undef SUM
1311

    
1312
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1313
    int tmp;
1314

    
1315
    assert( (((int)pix1) & 7) == 0);
1316
    assert( (((int)pix2) & 7) == 0);
1317
    assert((line_size &7) ==0);
1318

    
1319
#define SUM(in0, in1, out0, out1) \
1320
      "movq (%0),%%mm2\n"\
1321
      "movq (%1)," #out0 "\n"\
1322
      "movq 8(%0),%%mm3\n"\
1323
      "movq 8(%1)," #out1 "\n"\
1324
      "add %3,%0\n"\
1325
      "add %3,%1\n"\
1326
      "psubb " #out0 ", %%mm2\n"\
1327
      "psubb " #out1 ", %%mm3\n"\
1328
      "pxor %%mm7, %%mm2\n"\
1329
      "pxor %%mm7, %%mm3\n"\
1330
      "movq %%mm2, " #out0 "\n"\
1331
      "movq %%mm3, " #out1 "\n"\
1332
      "psubusb " #in0 ", %%mm2\n"\
1333
      "psubusb " #in1 ", %%mm3\n"\
1334
      "psubusb " #out0 ", " #in0 "\n"\
1335
      "psubusb " #out1 ", " #in1 "\n"\
1336
      "por %%mm2, " #in0 "\n"\
1337
      "por %%mm3, " #in1 "\n"\
1338
      "movq " #in0 ", %%mm2\n"\
1339
      "movq " #in1 ", %%mm3\n"\
1340
      "punpcklbw %%mm7, " #in0 "\n"\
1341
      "punpcklbw %%mm7, " #in1 "\n"\
1342
      "punpckhbw %%mm7, %%mm2\n"\
1343
      "punpckhbw %%mm7, %%mm3\n"\
1344
      "paddw " #in1 ", " #in0 "\n"\
1345
      "paddw %%mm3, %%mm2\n"\
1346
      "paddw %%mm2, " #in0 "\n"\
1347
      "paddw " #in0 ", %%mm6\n"
1348

    
1349

    
1350
  asm volatile (
1351
      "movl %4,%%ecx\n"
1352
      "pxor %%mm6,%%mm6\n"
1353
      "pcmpeqw %%mm7,%%mm7\n"
1354
      "psllw $15, %%mm7\n"
1355
      "packsswb %%mm7, %%mm7\n"
1356
      "movq (%0),%%mm0\n"
1357
      "movq (%1),%%mm2\n"
1358
      "movq 8(%0),%%mm1\n"
1359
      "movq 8(%1),%%mm3\n"
1360
      "add %3,%0\n"
1361
      "add %3,%1\n"
1362
      "subl $2, %%ecx\n"
1363
      "psubb %%mm2, %%mm0\n"
1364
      "psubb %%mm3, %%mm1\n"
1365
      "pxor %%mm7, %%mm0\n"
1366
      "pxor %%mm7, %%mm1\n"
1367
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1368
      "1:\n"
1369

    
1370
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1371

    
1372
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1373

    
1374
      "subl $2, %%ecx\n"
1375
      "jnz 1b\n"
1376

    
1377
      "movq %%mm6,%%mm0\n"
1378
      "psrlq $32, %%mm6\n"
1379
      "paddw %%mm6,%%mm0\n"
1380
      "movq %%mm0,%%mm6\n"
1381
      "psrlq $16, %%mm0\n"
1382
      "paddw %%mm6,%%mm0\n"
1383
      "movd %%mm0,%2\n"
1384
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1385
      : "r" ((long)line_size) , "m" (h)
1386
      : "%ecx");
1387
    return tmp & 0x7FFF;
1388
}
1389
#undef SUM
1390

    
1391
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1392
    int tmp;
1393

    
1394
    assert( (((int)pix1) & 7) == 0);
1395
    assert( (((int)pix2) & 7) == 0);
1396
    assert((line_size &7) ==0);
1397

    
1398
#define SUM(in0, in1, out0, out1) \
1399
      "movq (%0)," #out0 "\n"\
1400
      "movq (%1),%%mm2\n"\
1401
      "movq 8(%0)," #out1 "\n"\
1402
      "movq 8(%1),%%mm3\n"\
1403
      "add %3,%0\n"\
1404
      "add %3,%1\n"\
1405
      "psubb %%mm2, " #out0 "\n"\
1406
      "psubb %%mm3, " #out1 "\n"\
1407
      "pxor %%mm7, " #out0 "\n"\
1408
      "pxor %%mm7, " #out1 "\n"\
1409
      "psadbw " #out0 ", " #in0 "\n"\
1410
      "psadbw " #out1 ", " #in1 "\n"\
1411
      "paddw " #in1 ", " #in0 "\n"\
1412
      "paddw " #in0 ", %%mm6\n"
1413

    
1414
  asm volatile (
1415
      "movl %4,%%ecx\n"
1416
      "pxor %%mm6,%%mm6\n"
1417
      "pcmpeqw %%mm7,%%mm7\n"
1418
      "psllw $15, %%mm7\n"
1419
      "packsswb %%mm7, %%mm7\n"
1420
      "movq (%0),%%mm0\n"
1421
      "movq (%1),%%mm2\n"
1422
      "movq 8(%0),%%mm1\n"
1423
      "movq 8(%1),%%mm3\n"
1424
      "add %3,%0\n"
1425
      "add %3,%1\n"
1426
      "subl $2, %%ecx\n"
1427
      "psubb %%mm2, %%mm0\n"
1428
      "psubb %%mm3, %%mm1\n"
1429
      "pxor %%mm7, %%mm0\n"
1430
      "pxor %%mm7, %%mm1\n"
1431
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1432
      "1:\n"
1433

    
1434
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1435

    
1436
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1437

    
1438
      "subl $2, %%ecx\n"
1439
      "jnz 1b\n"
1440

    
1441
      "movd %%mm6,%2\n"
1442
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1443
      : "r" ((long)line_size) , "m" (h)
1444
      : "%ecx");
1445
    return tmp;
1446
}
1447
#undef SUM
1448

    
1449
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1450
    long i=0;
1451
    asm volatile(
1452
        "1:                             \n\t"
1453
        "movq  (%2, %0), %%mm0          \n\t"
1454
        "movq  (%1, %0), %%mm1          \n\t"
1455
        "psubb %%mm0, %%mm1             \n\t"
1456
        "movq %%mm1, (%3, %0)           \n\t"
1457
        "movq 8(%2, %0), %%mm0          \n\t"
1458
        "movq 8(%1, %0), %%mm1          \n\t"
1459
        "psubb %%mm0, %%mm1             \n\t"
1460
        "movq %%mm1, 8(%3, %0)          \n\t"
1461
        "add $16, %0                    \n\t"
1462
        "cmp %4, %0                     \n\t"
1463
        " jb 1b                         \n\t"
1464
        : "+r" (i)
1465
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1466
    );
1467
    for(; i<w; i++)
1468
        dst[i+0] = src1[i+0]-src2[i+0];
1469
}
1470

    
1471
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1472
    long i=0;
1473
    uint8_t l, lt;
1474

    
1475
    asm volatile(
1476
        "1:                             \n\t"
1477
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
1478
        "movq  (%1, %0), %%mm1          \n\t" // T
1479
        "movq  -1(%2, %0), %%mm2        \n\t" // L
1480
        "movq  (%2, %0), %%mm3          \n\t" // X
1481
        "movq %%mm2, %%mm4              \n\t" // L
1482
        "psubb %%mm0, %%mm2             \n\t"
1483
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
1484
        "movq %%mm4, %%mm5              \n\t" // L
1485
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
1486
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
1487
        "pminub %%mm2, %%mm4            \n\t"
1488
        "pmaxub %%mm1, %%mm4            \n\t"
1489
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
1490
        "movq %%mm3, (%3, %0)           \n\t"
1491
        "add $8, %0                     \n\t"
1492
        "cmp %4, %0                     \n\t"
1493
        " jb 1b                         \n\t"
1494
        : "+r" (i)
1495
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1496
    );
1497

    
1498
    l= *left;
1499
    lt= *left_top;
1500

    
1501
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1502

    
1503
    *left_top= src1[w-1];
1504
    *left    = src2[w-1];
1505
}
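/* DIFF_PIXELS_1 computes p1 - p2 as signed 16-bit words: interleaving both
 * operands with p1's bytes makes the high bytes cancel in the subtraction,
 * so no zero register is needed. DIFF_PIXELS_8 applies it to 8 rows,
 * spilling one register through the temp operand. */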
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
1508
    "mov"#m" "#p1", "#a"              \n\t"\
1509
    "mov"#m" "#p2", "#t"              \n\t"\
1510
    "punpcklbw "#a", "#t"             \n\t"\
1511
    "punpcklbw "#a", "#a"             \n\t"\
1512
    "psubw     "#t", "#a"             \n\t"\
1513

    
1514
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
1515
    uint8_t *p1b=p1, *p2b=p2;\
1516
    asm volatile(\
1517
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
1518
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
1519
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
1520
        "add %4, %1                   \n\t"\
1521
        "add %4, %2                   \n\t"\
1522
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
1523
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
1524
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
1525
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
1526
        "mov"#m1" "#mm"0, %0          \n\t"\
1527
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
1528
        "mov"#m1" %0, "#mm"0          \n\t"\
1529
        : "=m"(temp), "+r"(p1b), "+r"(p2b)\
1530
        : "r"((long)stride), "r"((long)stride*3)\
1531
    );\
1532
}
1533

    
1534
#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
1535
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
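/* 8x8 transpose of 16-bit elements built from SBUTTERFLY steps; x86-64 uses
 * xmm8 as the spare register, x86-32 spills through the memory operand t. */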
#ifdef ARCH_X86_64
1538
// permutes 01234567 -> 05736421
1539
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1540
    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
1541
    SBUTTERFLY(c,d,b,wd,dqa)\
1542
    SBUTTERFLY(e,f,d,wd,dqa)\
1543
    SBUTTERFLY(g,h,f,wd,dqa)\
1544
    SBUTTERFLY(a,c,h,dq,dqa)\
1545
    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
1546
    SBUTTERFLY(e,g,b,dq,dqa)\
1547
    SBUTTERFLY(d,f,g,dq,dqa)\
1548
    SBUTTERFLY(a,e,f,qdq,dqa)\
1549
    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
1550
    SBUTTERFLY(h,b,d,qdq,dqa)\
1551
    SBUTTERFLY(c,g,b,qdq,dqa)\
1552
    "movdqa %%xmm8, "#g"              \n\t"
1553
#else
1554
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1555
    "movdqa "#h", "#t"                \n\t"\
1556
    SBUTTERFLY(a,b,h,wd,dqa)\
1557
    "movdqa "#h", 16"#t"              \n\t"\
1558
    "movdqa "#t", "#h"                \n\t"\
1559
    SBUTTERFLY(c,d,b,wd,dqa)\
1560
    SBUTTERFLY(e,f,d,wd,dqa)\
1561
    SBUTTERFLY(g,h,f,wd,dqa)\
1562
    SBUTTERFLY(a,c,h,dq,dqa)\
1563
    "movdqa "#h", "#t"                \n\t"\
1564
    "movdqa 16"#t", "#h"              \n\t"\
1565
    SBUTTERFLY(h,b,c,dq,dqa)\
1566
    SBUTTERFLY(e,g,b,dq,dqa)\
1567
    SBUTTERFLY(d,f,g,dq,dqa)\
1568
    SBUTTERFLY(a,e,f,qdq,dqa)\
1569
    SBUTTERFLY(h,d,e,qdq,dqa)\
1570
    "movdqa "#h", 16"#t"              \n\t"\
1571
    "movdqa "#t", "#h"                \n\t"\
1572
    SBUTTERFLY(h,b,d,qdq,dqa)\
1573
    SBUTTERFLY(c,g,b,qdq,dqa)\
1574
    "movdqa 16"#t", "#g"              \n\t"
1575
#endif
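/* two butterflies at once: (a,b) -> (a+b, b-a); HADAMARD8 chains these into
 * an 8-point Walsh-Hadamard transform across eight registers. */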
#define LBUTTERFLY2(a1,b1,a2,b2)\
1578
    "paddw " #b1 ", " #a1 "           \n\t"\
1579
    "paddw " #b2 ", " #a2 "           \n\t"\
1580
    "paddw " #b1 ", " #b1 "           \n\t"\
1581
    "paddw " #b2 ", " #b2 "           \n\t"\
1582
    "psubw " #a1 ", " #b1 "           \n\t"\
1583
    "psubw " #a2 ", " #b2 "           \n\t"
1584

    
1585
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1586
        LBUTTERFLY2(m0, m1, m2, m3)\
1587
        LBUTTERFLY2(m4, m5, m6, m7)\
1588
        LBUTTERFLY2(m0, m2, m1, m3)\
1589
        LBUTTERFLY2(m4, m6, m5, m7)\
1590
        LBUTTERFLY2(m0, m4, m1, m5)\
1591
        LBUTTERFLY2(m2, m6, m3, m7)\
1592

    
1593
#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
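/* packed-word absolute value: plain MMX builds a sign mask, MMX2 takes the
 * max against the negation, SSSE3 has pabsw; z is scratch (unused on SSSE3). */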
#define MMABS_MMX(a,z)\
1596
    "pxor " #z ", " #z "              \n\t"\
1597
    "pcmpgtw " #a ", " #z "           \n\t"\
1598
    "pxor " #z ", " #a "              \n\t"\
1599
    "psubw " #z ", " #a "             \n\t"
1600

    
1601
#define MMABS_MMX2(a,z)\
1602
    "pxor " #z ", " #z "              \n\t"\
1603
    "psubw " #a ", " #z "             \n\t"\
1604
    "pmaxsw " #z ", " #a "            \n\t"
1605

    
1606
#define MMABS_SSSE3(a,z)\
1607
    "pabsw " #a ", " #a "             \n\t"
1608

    
1609
#define MMABS_SUM(a,z, sum)\
1610
    MMABS(a,z)\
1611
    "paddusw " #a ", " #sum "         \n\t"
1612

    
1613
#define MMABS_SUM_8x8_NOSPILL\
1614
    MMABS(%%xmm0, %%xmm8)\
1615
    MMABS(%%xmm1, %%xmm9)\
1616
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1617
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1618
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1619
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1620
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1621
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1622
    "paddusw %%xmm1, %%xmm0           \n\t"
1623

    
1624
#ifdef ARCH_X86_64
1625
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1626
#else
1627
#define MMABS_SUM_8x8_SSE2\
1628
    "movdqa %%xmm7, (%1)              \n\t"\
1629
    MMABS(%%xmm0, %%xmm7)\
1630
    MMABS(%%xmm1, %%xmm7)\
1631
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1632
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1633
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1634
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1635
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1636
    "movdqa (%1), %%xmm2              \n\t"\
1637
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1638
    "paddusw %%xmm1, %%xmm0           \n\t"
1639
#endif
1640

    
1641
#define LOAD4(o, a, b, c, d)\
1642
    "movq "#o"(%1),    "#a"           \n\t"\
1643
    "movq "#o"+8(%1),  "#b"           \n\t"\
1644
    "movq "#o"+16(%1), "#c"           \n\t"\
1645
    "movq "#o"+24(%1), "#d"           \n\t"\
1646

    
1647
#define STORE4(o, a, b, c, d)\
1648
    "movq "#a", "#o"(%1)              \n\t"\
1649
    "movq "#b", "#o"+8(%1)            \n\t"\
1650
    "movq "#c", "#o"+16(%1)           \n\t"\
1651
    "movq "#d", "#o"+24(%1)           \n\t"\
1652

    
1653
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
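/* HSUM_*: horizontal saturating sum of the packed words in a into the 32-bit
 * register dst; t is scratch. */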
#define HSUM_MMX(a, t, dst)\
1657
    "movq "#a", "#t"                  \n\t"\
1658
    "psrlq $32, "#a"                  \n\t"\
1659
    "paddusw "#t", "#a"               \n\t"\
1660
    "movq "#a", "#t"                  \n\t"\
1661
    "psrlq $16, "#a"                  \n\t"\
1662
    "paddusw "#t", "#a"               \n\t"\
1663
    "movd "#a", "#dst"                \n\t"\
1664

    
1665
#define HSUM_MMX2(a, t, dst)\
1666
    "pshufw $0x0E, "#a", "#t"         \n\t"\
1667
    "paddusw "#t", "#a"               \n\t"\
1668
    "pshufw $0x01, "#a", "#t"         \n\t"\
1669
    "paddusw "#t", "#a"               \n\t"\
1670
    "movd "#a", "#dst"                \n\t"\
1671

    
1672
#define HSUM_SSE2(a, t, dst)\
1673
    "movhlps "#a", "#t"               \n\t"\
1674
    "paddusw "#t", "#a"               \n\t"\
1675
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
1676
    "paddusw "#t", "#a"               \n\t"\
1677
    "pshuflw $0x01, "#a", "#t"        \n\t"\
1678
    "paddusw "#t", "#a"               \n\t"\
1679
    "movd "#a", "#dst"                \n\t"\
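/* SATD of an 8x8 block: Hadamard-transform the pixel differences (two 4x8
 * halves with MMX: transform, transpose, transform again), then sum the
 * absolute values with saturating adds and return the low 16 bits. */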
#define HADAMARD8_DIFF_MMX(cpu) \
1682
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1683
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1684
    int sum;\
1685
\
1686
    assert(h==8);\
1687
\
1688
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1689
\
1690
    asm volatile(\
1691
        HADAMARD48\
1692
\
1693
        "movq %%mm7, 96(%1)             \n\t"\
1694
\
1695
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1696
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1697
\
1698
        "movq 96(%1), %%mm7             \n\t"\
1699
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1700
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1701
\
1702
        : "=r" (sum)\
1703
        : "r"(temp)\
1704
    );\
1705
\
1706
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1707
\
1708
    asm volatile(\
1709
        HADAMARD48\
1710
\
1711
        "movq %%mm7, 96(%1)             \n\t"\
1712
\
1713
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1714
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1715
\
1716
        "movq 96(%1), %%mm7             \n\t"\
1717
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1718
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
1719
        "movq %%mm6, %%mm7              \n\t"\
1720
        "movq %%mm0, %%mm6              \n\t"\
1721
\
1722
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1723
\
1724
        HADAMARD48\
1725
        "movq %%mm7, 64(%1)             \n\t"\
1726
        MMABS(%%mm0, %%mm7)\
1727
        MMABS(%%mm1, %%mm7)\
1728
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1729
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1730
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1731
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1732
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1733
        "movq 64(%1), %%mm2             \n\t"\
1734
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1735
        "paddusw %%mm1, %%mm0           \n\t"\
1736
        "movq %%mm0, 64(%1)             \n\t"\
1737
\
1738
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1739
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1740
\
1741
        HADAMARD48\
1742
        "movq %%mm7, (%1)               \n\t"\
1743
        MMABS(%%mm0, %%mm7)\
1744
        MMABS(%%mm1, %%mm7)\
1745
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1746
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1747
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1748
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1749
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1750
        "movq (%1), %%mm2               \n\t"\
1751
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1752
        "paddusw 64(%1), %%mm0          \n\t"\
1753
        "paddusw %%mm1, %%mm0           \n\t"\
1754
\
1755
        HSUM(%%mm0, %%mm1, %0)\
1756
\
1757
        : "=r" (sum)\
1758
        : "r"(temp)\
1759
    );\
1760
    return sum&0xFFFF;\
1761
}\
1762
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1763

    
1764
#define HADAMARD8_DIFF_SSE2(cpu) \
1765
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1766
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1767
    int sum;\
1768
\
1769
    assert(h==8);\
1770
\
1771
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1772
\
1773
    asm volatile(\
1774
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1775
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1776
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1777
        MMABS_SUM_8x8\
1778
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1779
        : "=r" (sum)\
1780
        : "r"(temp)\
1781
    );\
1782
    return sum&0xFFFF;\
1783
}\
1784
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1785

    
1786
#define MMABS(a,z)         MMABS_MMX(a,z)
1787
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
1788
HADAMARD8_DIFF_MMX(mmx)
1789
#undef MMABS
1790
#undef HSUM
1791

    
1792
#define MMABS(a,z)         MMABS_MMX2(a,z)
1793
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
1794
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
1795
HADAMARD8_DIFF_MMX(mmx2)
1796
HADAMARD8_DIFF_SSE2(sse2)
1797
#undef MMABS
1798
#undef MMABS_SUM_8x8
1799
#undef HSUM
1800

    
1801
#ifdef HAVE_SSSE3
1802
#define MMABS(a,z)         MMABS_SSSE3(a,z)
1803
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
1804
HADAMARD8_DIFF_SSE2(ssse3)
1805
#undef MMABS
1806
#undef MMABS_SUM_8x8
1807
#endif
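/* sum_abs_dctelem_*: sum of the absolute values of all 64 coefficients of a
 * DCT block, accumulated with saturating adds and returned as 16 bits. */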
#define DCT_SAD4(m,mm,o)\
1810
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
1811
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
1812
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
1813
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
1814
    MMABS_SUM(mm##2, mm##6, mm##0)\
1815
    MMABS_SUM(mm##3, mm##7, mm##1)\
1816
    MMABS_SUM(mm##4, mm##6, mm##0)\
1817
    MMABS_SUM(mm##5, mm##7, mm##1)\
1818

    
1819
#define DCT_SAD_MMX\
1820
    "pxor %%mm0, %%mm0                \n\t"\
1821
    "pxor %%mm1, %%mm1                \n\t"\
1822
    DCT_SAD4(q, %%mm, 0)\
1823
    DCT_SAD4(q, %%mm, 8)\
1824
    DCT_SAD4(q, %%mm, 64)\
1825
    DCT_SAD4(q, %%mm, 72)\
1826
    "paddusw %%mm1, %%mm0             \n\t"\
1827
    HSUM(%%mm0, %%mm1, %0)
1828

    
1829
#define DCT_SAD_SSE2\
1830
    "pxor %%xmm0, %%xmm0              \n\t"\
1831
    "pxor %%xmm1, %%xmm1              \n\t"\
1832
    DCT_SAD4(dqa, %%xmm, 0)\
1833
    DCT_SAD4(dqa, %%xmm, 64)\
1834
    "paddusw %%xmm1, %%xmm0           \n\t"\
1835
    HSUM(%%xmm0, %%xmm1, %0)
1836

    
1837
#define DCT_SAD_FUNC(cpu) \
1838
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1839
    int sum;\
1840
    asm volatile(\
1841
        DCT_SAD\
1842
        :"=r"(sum)\
1843
        :"r"(block)\
1844
    );\
1845
    return sum&0xFFFF;\
1846
}
1847

    
1848
#define DCT_SAD       DCT_SAD_MMX
1849
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1850
#define MMABS(a,z)    MMABS_MMX(a,z)
1851
DCT_SAD_FUNC(mmx)
1852
#undef MMABS
1853
#undef HSUM
1854

    
1855
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1856
#define MMABS(a,z)    MMABS_MMX2(a,z)
1857
DCT_SAD_FUNC(mmx2)
1858
#undef HSUM
1859
#undef DCT_SAD
1860

    
1861
#define DCT_SAD       DCT_SAD_SSE2
1862
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1863
DCT_SAD_FUNC(sse2)
1864
#undef MMABS
1865

    
1866
#ifdef HAVE_SSSE3
1867
#define MMABS(a,z)    MMABS_SSSE3(a,z)
1868
DCT_SAD_FUNC(ssse3)
1869
#undef MMABS
1870
#endif
1871
#undef HSUM
1872
#undef DCT_SAD
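/* sum of squared differences between an int8 block and an int16 block;
 * size is assumed to be a multiple of 8 (8 samples per iteration). */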
static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){
1875
    int sum;
1876
    long i=size;
1877
    asm volatile(
1878
        "pxor %%mm4, %%mm4 \n"
1879
        "1: \n"
1880
        "sub $8, %0 \n"
1881
        "movq (%2,%0), %%mm2 \n"
1882
        "movq (%3,%0,2), %%mm0 \n"
1883
        "movq 8(%3,%0,2), %%mm1 \n"
1884
        "punpckhbw %%mm2, %%mm3 \n"
1885
        "punpcklbw %%mm2, %%mm2 \n"
1886
        "psraw $8, %%mm3 \n"
1887
        "psraw $8, %%mm2 \n"
1888
        "psubw %%mm3, %%mm1 \n"
1889
        "psubw %%mm2, %%mm0 \n"
1890
        "pmaddwd %%mm1, %%mm1 \n"
1891
        "pmaddwd %%mm0, %%mm0 \n"
1892
        "paddd %%mm1, %%mm4 \n"
1893
        "paddd %%mm0, %%mm4 \n"
1894
        "jg 1b \n"
1895
        "movq %%mm4, %%mm3 \n"
1896
        "psrlq $32, %%mm3 \n"
1897
        "paddd %%mm3, %%mm4 \n"
1898
        "movd %%mm4, %1 \n"
1899
        :"+r"(i), "=r"(sum)
1900
        :"r"(pix1), "r"(pix2)
1901
    );
1902
    return sum;
1903
}
1904

    
1905
#endif //CONFIG_ENCODERS
1906

    
1907
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
1908
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
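/* one output row of the MPEG-4 qpel lowpass filter:
 * (20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5, packed back to bytes and
 * written through OP (plain store or average with the destination). */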
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1911
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
1912
        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
1913
        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
1914
        "movq "#in7", " #m3 "             \n\t" /* d */\
1915
        "movq "#in0", %%mm5               \n\t" /* D */\
1916
        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
1917
        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
1918
        "movq "#in1", %%mm5               \n\t" /* C */\
1919
        "movq "#in2", %%mm6               \n\t" /* B */\
1920
        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
1921
        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
1922
        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
1923
        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
1924
        "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
1925
        "paddw " #rnd ", %%mm4            \n\t" /* x2 */\
1926
        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1927
        "psraw $5, %%mm5                  \n\t"\
1928
        "packuswb %%mm5, %%mm5            \n\t"\
1929
        OP(%%mm5, out, %%mm7, d)
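/* QPEL_BASE defines the horizontal 8- and 16-wide lowpass filters: the MMX2
 * versions work fully in asm (using pshufw), the 3DNow! versions compute each
 * row in C and only do the rounding/packing/averaging in asm. */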
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1932
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1933
    uint64_t temp;\
1934
\
1935
    asm volatile(\
1936
        "pxor %%mm7, %%mm7                \n\t"\
1937
        "1:                               \n\t"\
1938
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
1939
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
1940
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
1941
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
1942
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
1943
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
1944
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
1945
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
1946
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
1947
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
1948
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
1949
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
1950
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
1951
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
1952
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
1953
        "paddw %%mm3, %%mm5               \n\t" /* b */\
1954
        "paddw %%mm2, %%mm6               \n\t" /* c */\
1955
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
1956
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
1957
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
1958
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
1959
        "paddw %%mm4, %%mm0               \n\t" /* a */\
1960
        "paddw %%mm1, %%mm5               \n\t" /* d */\
1961
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1962
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
1963
        "paddw %6, %%mm6                  \n\t"\
1964
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
1965
        "psraw $5, %%mm0                  \n\t"\
1966
        "movq %%mm0, %5                   \n\t"\
1967
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1968
        \
1969
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
1970
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
1971
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
1972
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
1973
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
1974
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
1975
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
1976
        "paddw %%mm0, %%mm2               \n\t" /* b */\
1977
        "paddw %%mm5, %%mm3               \n\t" /* c */\
1978
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
1979
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
1980
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
1981
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
1982
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
1983
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
1984
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
1985
        "paddw %%mm2, %%mm1               \n\t" /* a */\
1986
        "paddw %%mm6, %%mm4               \n\t" /* d */\
1987
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1988
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
1989
        "paddw %6, %%mm1                  \n\t"\
1990
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
1991
        "psraw $5, %%mm3                  \n\t"\
1992
        "movq %5, %%mm1                   \n\t"\
1993
        "packuswb %%mm3, %%mm1            \n\t"\
1994
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
1995
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1996
        \
1997
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
1998
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
1999
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
2000
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
2001
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
2002
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
2003
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
2004
        "paddw %%mm1, %%mm5               \n\t" /* b */\
2005
        "paddw %%mm4, %%mm0               \n\t" /* c */\
2006
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
2007
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
2008
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
2009
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
2010
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
2011
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
2012
        "paddw %%mm3, %%mm2               \n\t" /* d */\
2013
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
2014
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
2015
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
2016
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
2017
        "paddw %%mm2, %%mm6               \n\t" /* a */\
2018
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
2019
        "paddw %6, %%mm0                  \n\t"\
2020
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
2021
        "psraw $5, %%mm0                  \n\t"\
2022
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
2023
        \
2024
        "paddw %%mm5, %%mm3               \n\t" /* a */\
2025
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
2026
        "paddw %%mm4, %%mm6               \n\t" /* b */\
2027
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
2028
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
2029
        "paddw %%mm1, %%mm4               \n\t" /* c */\
2030
        "paddw %%mm2, %%mm5               \n\t" /* d */\
2031
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
2032
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
2033
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2034
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
2035
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
2036
        "paddw %6, %%mm4                  \n\t"\
2037
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
2038
        "psraw $5, %%mm4                  \n\t"\
2039
        "packuswb %%mm4, %%mm0            \n\t"\
2040
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2041
        \
2042
        "add %3, %0                       \n\t"\
2043
        "add %4, %1                       \n\t"\
2044
        "decl %2                          \n\t"\
2045
        " jnz 1b                          \n\t"\
2046
        : "+a"(src), "+c"(dst), "+m"(h)\
2047
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2048
        : "memory"\
2049
    );\
2050
}\
2051
\
2052
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2053
    int i;\
2054
    int16_t temp[16];\
2055
    /* quick HACK, XXX FIXME MUST be optimized */\
2056
    for(i=0; i<h; i++)\
2057
    {\
2058
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2059
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2060
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2061
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2062
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2063
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2064
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2065
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2066
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2067
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2068
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2069
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2070
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2071
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2072
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2073
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2074
        asm volatile(\
2075
            "movq (%0), %%mm0               \n\t"\
2076
            "movq 8(%0), %%mm1              \n\t"\
2077
            "paddw %2, %%mm0                \n\t"\
2078
            "paddw %2, %%mm1                \n\t"\
2079
            "psraw $5, %%mm0                \n\t"\
2080
            "psraw $5, %%mm1                \n\t"\
2081
            "packuswb %%mm1, %%mm0          \n\t"\
2082
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2083
            "movq 16(%0), %%mm0             \n\t"\
2084
            "movq 24(%0), %%mm1             \n\t"\
2085
            "paddw %2, %%mm0                \n\t"\
2086
            "paddw %2, %%mm1                \n\t"\
2087
            "psraw $5, %%mm0                \n\t"\
2088
            "psraw $5, %%mm1                \n\t"\
2089
            "packuswb %%mm1, %%mm0          \n\t"\
2090
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2091
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2092
            : "memory"\
2093
        );\
2094
        dst+=dstStride;\
2095
        src+=srcStride;\
2096
    }\
2097
}\
2098
\
2099
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2100
    uint64_t temp;\
2101
\
2102
    asm volatile(\
2103
        "pxor %%mm7, %%mm7                \n\t"\
2104
        "1:                               \n\t"\
2105
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
2106
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
2107
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
2108
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
2109
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
2110
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
2111
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
2112
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
2113
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
2114
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
2115
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
2116
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
2117
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
2118
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
2119
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
2120
        "paddw %%mm3, %%mm5               \n\t" /* b */\
2121
        "paddw %%mm2, %%mm6               \n\t" /* c */\
2122
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
2123
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
2124
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
2125
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
2126
        "paddw %%mm4, %%mm0               \n\t" /* a */\
2127
        "paddw %%mm1, %%mm5               \n\t" /* d */\
2128
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2129
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
2130
        "paddw %6, %%mm6                  \n\t"\
2131
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
2132
        "psraw $5, %%mm0                  \n\t"\
2133
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2134
        \
2135
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
2136
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
2137
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
2138
        "paddw %%mm5, %%mm1               \n\t" /* a */\
2139
        "paddw %%mm6, %%mm2               \n\t" /* b */\
2140
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
2141
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
2142
        "paddw %%mm6, %%mm3               \n\t" /* c */\
2143
        "paddw %%mm5, %%mm4               \n\t" /* d */\
2144
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
2145
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
2146
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2147
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
2148
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
2149
        "paddw %6, %%mm1                  \n\t"\
2150
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
2151
        "psraw $5, %%mm3                  \n\t"\
2152
        "packuswb %%mm3, %%mm0            \n\t"\
2153
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
2154
        \
2155
        "add %3, %0                       \n\t"\
2156
        "add %4, %1                       \n\t"\
2157
        "decl %2                          \n\t"\
2158
        " jnz 1b                          \n\t"\
2159
        : "+a"(src), "+c"(dst), "+m"(h)\
2160
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2161
        : "memory"\
2162
    );\
2163
}\
2164
\
2165
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2166
    int i;\
2167
    int16_t temp[8];\
2168
    /* quick HACK, XXX FIXME MUST be optimized */\
2169
    for(i=0; i<h; i++)\
2170
    {\
2171
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2172
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2173
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2174
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2175
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2176
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2177
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2178
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2179
        asm volatile(\
2180
            "movq (%0), %%mm0           \n\t"\
2181
            "movq 8(%0), %%mm1          \n\t"\
2182
            "paddw %2, %%mm0            \n\t"\
2183
            "paddw %2, %%mm1            \n\t"\
2184
            "psraw $5, %%mm0            \n\t"\
2185
            "psraw $5, %%mm1            \n\t"\
2186
            "packuswb %%mm1, %%mm0      \n\t"\
2187
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2188
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2189
            :"memory"\
2190
        );\
2191
        dst+=dstStride;\
2192
        src+=srcStride;\
2193
    }\
2194
}
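/* QPEL_OP defines the vertical lowpass filters and the full set of
 * OPNAME ## qpel{8,16}_mcXY functions (all 16 quarter-pel positions) on top
 * of the lowpass primitives and the pixels*_l2 averaging helpers. */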
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2197
\
2198
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2199
    uint64_t temp[17*4];\
2200
    uint64_t *temp_ptr= temp;\
2201
    int count= 17;\
2202
\
2203
    /*FIXME unroll */\
2204
    asm volatile(\
2205
        "pxor %%mm7, %%mm7              \n\t"\
2206
        "1:                             \n\t"\
2207
        "movq (%0), %%mm0               \n\t"\
2208
        "movq (%0), %%mm1               \n\t"\
2209
        "movq 8(%0), %%mm2              \n\t"\
2210
        "movq 8(%0), %%mm3              \n\t"\
2211
        "punpcklbw %%mm7, %%mm0         \n\t"\
2212
        "punpckhbw %%mm7, %%mm1         \n\t"\
2213
        "punpcklbw %%mm7, %%mm2         \n\t"\
2214
        "punpckhbw %%mm7, %%mm3         \n\t"\
2215
        "movq %%mm0, (%1)               \n\t"\
2216
        "movq %%mm1, 17*8(%1)           \n\t"\
2217
        "movq %%mm2, 2*17*8(%1)         \n\t"\
2218
        "movq %%mm3, 3*17*8(%1)         \n\t"\
2219
        "add $8, %1                     \n\t"\
2220
        "add %3, %0                     \n\t"\
2221
        "decl %2                        \n\t"\
2222
        " jnz 1b                        \n\t"\
2223
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2224
        : "r" ((long)srcStride)\
2225
        : "memory"\
2226
    );\
2227
    \
2228
    temp_ptr= temp;\
2229
    count=4;\
2230
    \
2231
/*FIXME reorder for speed */\
2232
    asm volatile(\
2233
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2234
        "1:                             \n\t"\
2235
        "movq (%0), %%mm0               \n\t"\
2236
        "movq 8(%0), %%mm1              \n\t"\
2237
        "movq 16(%0), %%mm2             \n\t"\
2238
        "movq 24(%0), %%mm3             \n\t"\
2239
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2240
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2241
        "add %4, %1                     \n\t"\
2242
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2243
        \
2244
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2245
        "add %4, %1                     \n\t"\
2246
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2247
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2248
        "add %4, %1                     \n\t"\
2249
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2250
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2251
        "add %4, %1                     \n\t"\
2252
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2253
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2254
        "add %4, %1                     \n\t"\
2255
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2256
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2257
        "add %4, %1                     \n\t"\
2258
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2259
        \
2260
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2261
        "add %4, %1                     \n\t"  \
2262
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2263
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2264
        \
2265
        "add $136, %0                   \n\t"\
2266
        "add %6, %1                     \n\t"\
2267
        "decl %2                        \n\t"\
2268
        " jnz 1b                        \n\t"\
2269
        \
2270
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2271
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2272
        :"memory"\
2273
    );\
2274
}\
2275
\
2276
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2277
    uint64_t temp[9*2];\
2278
    uint64_t *temp_ptr= temp;\
2279
    int count= 9;\
2280
\
2281
    /*FIXME unroll */\
2282
    asm volatile(\
2283
        "pxor %%mm7, %%mm7              \n\t"\
2284
        "1:                             \n\t"\
2285
        "movq (%0), %%mm0               \n\t"\
2286
        "movq (%0), %%mm1               \n\t"\
2287
        "punpcklbw %%mm7, %%mm0         \n\t"\
2288
        "punpckhbw %%mm7, %%mm1         \n\t"\
2289
        "movq %%mm0, (%1)               \n\t"\
2290
        "movq %%mm1, 9*8(%1)            \n\t"\
2291
        "add $8, %1                     \n\t"\
2292
        "add %3, %0                     \n\t"\
2293
        "decl %2                        \n\t"\
2294
        " jnz 1b                        \n\t"\
2295
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2296
        : "r" ((long)srcStride)\
2297
        : "memory"\
2298
    );\
2299
    \
2300
    temp_ptr= temp;\
2301
    count=2;\
2302
    \
2303
/*FIXME reorder for speed */\
2304
    asm volatile(\
2305
        /*"pxor %%mm7, %%mm7              \n\t"*/\
2306
        "1:                             \n\t"\
2307
        "movq (%0), %%mm0               \n\t"\
2308
        "movq 8(%0), %%mm1              \n\t"\
2309
        "movq 16(%0), %%mm2             \n\t"\
2310
        "movq 24(%0), %%mm3             \n\t"\
2311
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
2312
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
2313
        "add %4, %1                     \n\t"\
2314
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
2315
        \
2316
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2317
        "add %4, %1                     \n\t"\
2318
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2319
        \
2320
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2321
        "add %4, %1                     \n\t"\
2322
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2323
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2324
                \
2325
        "add $72, %0                    \n\t"\
2326
        "add %6, %1                     \n\t"\
2327
        "decl %2                        \n\t"\
2328
        " jnz 1b                        \n\t"\
2329
         \
2330
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2331
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2332
        : "memory"\
2333
   );\
2334
}\
2335
\
2336
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2337
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
2338
}\
2339
\
2340
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2341
    uint64_t temp[8];\
2342
    uint8_t * const half= (uint8_t*)temp;\
2343
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2344
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2345
}\
2346
\
2347
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2348
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2349
}\
2350
\
2351
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2352
    uint64_t temp[8];\
2353
    uint8_t * const half= (uint8_t*)temp;\
2354
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2355
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2356
}\
2357
\
2358
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2359
    uint64_t temp[8];\
2360
    uint8_t * const half= (uint8_t*)temp;\
2361
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2362
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2363
}\
2364
\
2365
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2366
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2367
}\
2368
\
2369
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2370
    uint64_t temp[8];\
2371
    uint8_t * const half= (uint8_t*)temp;\
2372
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2373
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2374
}\
2375
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2376
    uint64_t half[8 + 9];\
2377
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2378
    uint8_t * const halfHV= ((uint8_t*)half);\
2379
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2380
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2381
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2382
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2383
}\
2384
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2385
    uint64_t half[8 + 9];\
2386
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2387
    uint8_t * const halfHV= ((uint8_t*)half);\
2388
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2389
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2390
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2391
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2392
}\
2393
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2394
    uint64_t half[8 + 9];\
2395
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2396
    uint8_t * const halfHV= ((uint8_t*)half);\
2397
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2398
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2399
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2400
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2401
}\
2402
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2403
    uint64_t half[8 + 9];\
2404
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2405
    uint8_t * const halfHV= ((uint8_t*)half);\
2406
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2407
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2408
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2409
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2410
}\
2411
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2412
    uint64_t half[8 + 9];\
2413
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2414
    uint8_t * const halfHV= ((uint8_t*)half);\
2415
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2416
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2417
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2418
}\
2419
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2420
    uint64_t half[8 + 9];\
2421
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
2422
    uint8_t * const halfHV= ((uint8_t*)half);\
2423
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2424
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2425
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2426
}\
2427
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2428
    uint64_t half[8 + 9];\
2429
    uint8_t * const halfH= ((uint8_t*)half);\
2430
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2431
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2432
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2433
}\
2434
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2435
    uint64_t half[8 + 9];\
2436
    uint8_t * const halfH= ((uint8_t*)half);\
2437
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2438
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2439
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2440
}\
2441
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2442
    uint64_t half[9];\
2443
    uint8_t * const halfH= ((uint8_t*)half);\
2444
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2445
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2446
}\
2447
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2448
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
2449
}\
2450
\
2451
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2452
    uint64_t temp[32];\
2453
    uint8_t * const half= (uint8_t*)temp;\
2454
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2455
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2456
}\
2457
\
2458
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2459
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2460
}\
2461
\
2462
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2463
    uint64_t temp[32];\
2464
    uint8_t * const half= (uint8_t*)temp;\
2465
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2466
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2467
}\
2468
\
2469
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2470
    uint64_t temp[32];\
2471
    uint8_t * const half= (uint8_t*)temp;\
2472
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2473
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2474
}\
2475
\
2476
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2477
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2478
}\
2479
\
2480
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2481
    uint64_t temp[32];\
2482
    uint8_t * const half= (uint8_t*)temp;\
2483
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2484
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2485
}\
2486
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2487
    uint64_t half[16*2 + 17*2];\
2488
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2489
    uint8_t * const halfHV= ((uint8_t*)half);\
2490
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2491
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2492
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2493
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2494
}\
2495
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2496
    uint64_t half[16*2 + 17*2];\
2497
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2498
    uint8_t * const halfHV= ((uint8_t*)half);\
2499
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2500
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2501
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2502
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2503
}\
2504
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2505
    uint64_t half[16*2 + 17*2];\
2506
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2507
    uint8_t * const halfHV= ((uint8_t*)half);\
2508
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2509
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2510
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2511
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2512
}\
2513
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2514
    uint64_t half[16*2 + 17*2];\
2515
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2516
    uint8_t * const halfHV= ((uint8_t*)half);\
2517
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2518
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2519
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2520
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2521
}\
2522
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2523
    uint64_t half[16*2 + 17*2];\
2524
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2525
    uint8_t * const halfHV= ((uint8_t*)half);\
2526
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2527
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2528
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2529
}\
2530
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2531
    uint64_t half[16*2 + 17*2];\
2532
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
2533
    uint8_t * const halfHV= ((uint8_t*)half);\
2534
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2535
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2536
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2537
}\
2538
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2539
    uint64_t half[17*2];\
2540
    uint8_t * const halfH= ((uint8_t*)half);\
2541
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2542
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2543
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2544
}\
2545
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2546
    uint64_t half[17*2];\
2547
    uint8_t * const halfH= ((uint8_t*)half);\
2548
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2549
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2550
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2551
}\
2552
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2553
    uint64_t half[17*2];\
2554
    uint8_t * const halfH= ((uint8_t*)half);\
2555
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2556
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2557
}
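/* write-back ops used by the qpel macros: PUT_OP stores the result, the AVG
 * variants average it with the destination first (pavgusb on 3DNow!,
 * pavgb on MMX2). */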
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
2560
#define AVG_3DNOW_OP(a,b,temp, size) \
2561
"mov" #size " " #b ", " #temp "   \n\t"\
2562
"pavgusb " #temp ", " #a "        \n\t"\
2563
"mov" #size " " #a ", " #b "      \n\t"
2564
#define AVG_MMX2_OP(a,b,temp, size) \
2565
"mov" #size " " #b ", " #temp "   \n\t"\
2566
"pavgb " #temp ", " #a "          \n\t"\
2567
"mov" #size " " #a ", " #b "      \n\t"
2568

    
2569
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
2570
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
2571
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2572
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
2573
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
2574
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
2575
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
2576
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
2577
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2578

    
2579
/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
2581

    
2582
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2583
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2584
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2585
}
2586
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2587
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2588
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2589
}
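/* each quarter-pel position is approximated by a half-pel average or by a
 * 3-tap blend (the _l3_ helpers) of nearby pixels instead of the real
 * lowpass filter. */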
#define QPEL_2TAP(OPNAME, SIZE, MMX)\
2592
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2593
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2594
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2595
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2596
                          OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2597
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2598
                          OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2599
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2600
                          OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2601
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2602
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2603
}\
2604
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2605
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2606
}\
2607
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
2608
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
2609
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0)\
2610
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
2611
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
2612
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
2613
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
2614
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
2615

    
2616
QPEL_2TAP(put_, 16, mmx2)
2617
QPEL_2TAP(avg_, 16, mmx2)
2618
QPEL_2TAP(put_,  8, mmx2)
2619
QPEL_2TAP(avg_,  8, mmx2)
2620
QPEL_2TAP(put_, 16, 3dnow)
2621
QPEL_2TAP(avg_, 16, 3dnow)
2622
QPEL_2TAP(put_,  8, 3dnow)
2623
QPEL_2TAP(avg_,  8, 3dnow)
2624

    
2625

    
2626
#if 0
2627
static void just_return() { return; }
2628
#endif
2629

    
2630
#define SET_QPEL_FUNC(postfix1, postfix2) \
2631
    c->put_ ## postfix1 = put_ ## postfix2;\
2632
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
2633
    c->avg_ ## postfix1 = avg_ ## postfix2;
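/* MMX global motion compensation for an 8-pixel-wide block: falls back to
 * ff_gmc_c for non-constant full-pel offsets or >16-bit sub-pel precision,
 * uses the emulated edge buffer when the source area leaves the picture,
 * and does the bilinear interpolation four pixels at a time. */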
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w = 8;
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    if( // non-constant fullpel offset (3% of blocks)
        (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
         oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
    {
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }

    asm volatile(
        "movd         %0, %%mm6 \n\t"
        "pxor      %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1<<shift)
    );

    for(x=0; x<w; x+=4){
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            asm volatile(
                "movq   %0,  %%mm4 \n\t"
                "movq   %1,  %%mm5 \n\t"
                "paddw  %2,  %%mm4 \n\t"
                "paddw  %3,  %%mm5 \n\t"
                "movq   %%mm4, %0  \n\t"
                "movq   %%mm5, %1  \n\t"
                "psrlw  $12, %%mm4 \n\t"
                "psrlw  $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            asm volatile(
                "movq   %%mm6, %%mm2 \n\t"
                "movq   %%mm6, %%mm1 \n\t"
                "psubw  %%mm4, %%mm2 \n\t"
                "psubw  %%mm5, %%mm1 \n\t"
                "movq   %%mm2, %%mm0 \n\t"
                "movq   %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)

                "movd   %4,    %%mm5 \n\t"
                "movd   %3,    %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy

                "movd   %2,    %%mm5 \n\t"
                "movd   %1,    %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw  %5,    %%mm1 \n\t"
                "paddw  %%mm3, %%mm2 \n\t"
                "paddw  %%mm1, %%mm0 \n\t"
                "paddw  %%mm2, %%mm0 \n\t"

                "psrlw    %6,    %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd     %%mm0, %0    \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4-h*stride;
    }
}

#ifdef CONFIG_ENCODERS

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
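/* Note on the three PMULHRW flavours below: plain MMX uses pmulhw plus a
 * rounded >>1 (an effective shift of 17), 3DNow!'s pmulhrw shifts by 16 and
 * SSSE3's pmulhrsw by 15; the matching SCALE_OFFSET values (1, 0, -1) are
 * exactly that effective shift minus 16, which lets dsputil_mmx_qns.h
 * compensate for the one-bit differences. */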
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3

#endif /* CONFIG_ENCODERS */

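/* Issue one prefetch hint per row of an h-row block (the first cache line
 * of each row), to pull reference data into the cache ahead of use. */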
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        asm volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH

#include "h264dsp_mmx.c"

/* AVS specific */
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);

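/* The mc00 (zero subpel offset) cases are plain block copies/averages, so
 * the generic MMX put/avg_pixels routines are reused for CAVS here. */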
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
#ifdef CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}

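/* Vorbis inverse channel coupling, branchless SIMD form: sign masks built
 * with compare + and/andn decide, per element, whether the angle value is
 * added to or subtracted from the magnitude (see the comments on the
 * pfadd/pfsub resp. addps/subps lines), replacing the per-sample branches
 * of the C version. */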
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    asm volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        asm volatile(
            "movq    %0,    %%mm0 \n\t"
            "movq    %1,    %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1 \n\t"
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t"
            "pandn   %%mm1, %%mm4 \n\t"
            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    asm volatile("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    asm volatile(
            "movaps  %0,     %%xmm5 \n\t"
        ::"m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        asm volatile(
            "movaps  %0,     %%xmm0 \n\t"
            "movaps  %1,     %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}

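/* In-place element-wise multiply: dst[i] *= src[i].  The index register
 * starts at the last block and counts down to 0, so len is assumed to be a
 * multiple of 4 (3DNow!) resp. 8 (SSE), with 16-byte aligned buffers for
 * the movaps version. */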
static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long i = (len-4)*4;
    asm volatile(
        "1: \n\t"
        "movq    (%1,%0), %%mm0 \n\t"
        "movq   8(%1,%0), %%mm1 \n\t"
        "pfmul   (%2,%0), %%mm0 \n\t"
        "pfmul  8(%2,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub  $16, %0 \n\t"
        "jge 1b \n\t"
        "femms  \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i = (len-8)*4;
    asm volatile(
        "1: \n\t"
        "movaps    (%1,%0), %%xmm0 \n\t"
        "movaps  16(%1,%0), %%xmm1 \n\t"
        "mulps     (%2,%0), %%xmm0 \n\t"
        "mulps   16(%2,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub  $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}

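/* dst[i] = src0[i] * src1[len-1-i]: dst/src0 are walked backwards while
 * src1 is read forwards, with pswapd resp. shufps reversing the element
 * order inside each register. */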
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;
    asm volatile(
        "1: \n\t"
        "pswapd   8(%1), %%mm0 \n\t"
        "pswapd    (%1), %%mm1 \n\t"
        "pfmul  (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq  %%mm0,  (%2,%0) \n\t"
        "movq  %%mm1, 8(%2,%0) \n\t"
        "add   $16, %1 \n\t"
        "sub   $16, %0 \n\t"
        "jge   1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
    asm volatile("femms");
}
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;
    asm volatile(
        "1: \n\t"
        "movaps        16(%1), %%xmm0 \n\t"
        "movaps          (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps        (%3,%0), %%xmm0 \n\t"
        "mulps      16(%3,%0), %%xmm1 \n\t"
        "movaps     %%xmm0,   (%2,%0) \n\t"
        "movaps     %%xmm1, 16(%2,%0) \n\t"
        "add    $32, %1 \n\t"
        "sub    $32, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
}

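/* Fused multiply-add, typically used for windowing + overlap-add:
 * dst[i*step] = src0[i]*src1[i] + src2[i].  Only the src3 == 0, step == 1
 * or step == 2 cases are handled in SIMD; anything else falls back to
 * ff_vector_fmul_add_add_c(). */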
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long i = (len-4)*4;
    if(step == 2 && src3 == 0){
        dst += (len-4)*2;
        asm volatile(
            "1: \n\t"
            "movq   (%2,%0),  %%mm0 \n\t"
            "movq  8(%2,%0),  %%mm1 \n\t"
            "pfmul  (%3,%0),  %%mm0 \n\t"
            "pfmul 8(%3,%0),  %%mm1 \n\t"
            "pfadd  (%4,%0),  %%mm0 \n\t"
            "pfadd 8(%4,%0),  %%mm1 \n\t"
            "movd     %%mm0,   (%1) \n\t"
            "movd     %%mm1, 16(%1) \n\t"
            "psrlq      $32,  %%mm0 \n\t"
            "psrlq      $32,  %%mm1 \n\t"
            "movd     %%mm0,  8(%1) \n\t"
            "movd     %%mm1, 24(%1) \n\t"
            "sub  $32, %1 \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movq  %%mm0,   (%1,%0) \n\t"
            "movq  %%mm1,  8(%1,%0) \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    asm volatile("femms");
}
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    long i = (len-8)*4;
    if(step == 2 && src3 == 0){
        dst += (len-8)*2;
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movss     %%xmm0,   (%1) \n\t"
            "movss     %%xmm1, 32(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 16(%1) \n\t"
            "movss     %%xmm3, 48(%1) \n\t"
            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
            "movss     %%xmm0,  8(%1) \n\t"
            "movss     %%xmm1, 40(%1) \n\t"
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 24(%1) \n\t"
            "movss     %%xmm3, 56(%1) \n\t"
            "sub  $64, %1 \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movaps %%xmm0,   (%1,%0) \n\t"
            "movaps %%xmm1, 16(%1,%0) \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}

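/* Float to int16 conversion with saturation (packssdw clamps to the int16
 * range).  pf2id truncates toward zero while cvtps2pi uses the current
 * MXCSR rounding mode (round-to-nearest by default), hence the
 * "not bit-exact" note below. */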
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "pf2id       %1, %%mm0 \n\t"
            "pf2id       %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("femms");
}
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "cvtps2pi    %1, %%mm0 \n\t"
            "cvtps2pi    %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("emms");
}

#ifdef CONFIG_SNOW_DECODER
extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
#endif

void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    mm_flags = mm_support();

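    /* avctx->dsp_mask lets the caller override CPU autodetection: with
       FF_MM_FORCE set the masked flags are forced on, otherwise they are
       cleared (e.g. FF_MM_FORCE|MM_MMX enables the MMX paths even if
       mm_support() did not report MMX, while a mask of MM_SSE2 alone
       disables the SSE2 paths). */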
    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & FF_MM_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }

#if 0
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        av_log(avctx, AV_LOG_INFO, " mmx");
    if (mm_flags & MM_MMXEXT)
        av_log(avctx, AV_LOG_INFO, " mmxext");
    if (mm_flags & MM_3DNOW)
        av_log(avctx, AV_LOG_INFO, " 3dnow");
    if (mm_flags & MM_SSE)
        av_log(avctx, AV_LOG_INFO, " sse");
    if (mm_flags & MM_SSE2)
        av_log(avctx, AV_LOG_INFO, " sse2");
    av_log(avctx, AV_LOG_INFO, "\n");
#endif

    if (mm_flags & MM_MMX) {
        const int idct_algo= avctx->idct_algo;