/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "../dsputil.h"
#include "../simple_idct.h"
#include "../mpegvideo.h"
#include "mmx.h"

//#undef NDEBUG
//#include <assert.h>

extern const uint8_t ff_h263_loop_filter_strength[32];

int mm_flags; /* multimedia extension flags */

/* pixel operations */
static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
static const uint64_t ff_pw_4  attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
static const uint64_t ff_pw_5  attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;

static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;

#define JUMPALIGN() __asm __volatile (".balign 8"::)
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

// load 0x0001000100010001 without a memory access: all-ones words >> 15
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

// load 0xFEFEFEFEFEFEFEFE: all-ones bytes (0xFF) doubled with byte wraparound
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared libraries it is better to build these constants in registers
// than to access them from memory; pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "        \n\t"\
    "pand " #regb ", " #regr "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pand " #regfe "," #regb "        \n\t"\
    "psrlq $1, " #regb "         \n\t"\
    "paddb " #regb ", " #regr "        \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "        \n\t"\
    "por  " #regb ", " #regr "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pand " #regfe "," #regb "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr "        \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "        \n\t"\
    "movq " #regc ", " #regp "        \n\t"\
    "pand " #regb ", " #regr "        \n\t"\
    "pand " #regd ", " #regp "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pxor " #regc ", " #regd "        \n\t"\
    "pand %%mm6, " #regb "        \n\t"\
    "pand %%mm6, " #regd "        \n\t"\
    "psrlq $1, " #regb "         \n\t"\
    "psrlq $1, " #regd "         \n\t"\
    "paddb " #regb ", " #regr "        \n\t"\
    "paddb " #regd ", " #regp "        \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "        \n\t"\
    "movq " #regc ", " #regp "        \n\t"\
    "por  " #regb ", " #regr "        \n\t"\
    "por  " #regd ", " #regp "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pxor " #regc ", " #regd "        \n\t"\
    "pand %%mm6, " #regb "             \n\t"\
    "pand %%mm6, " #regd "             \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr "        \n\t"\
    "psubb " #regd ", " #regp "        \n\t"

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)                PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)                PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
/* on Athlons PAVGUSB is preferred */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* introduced only in the MMX2 instruction set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"        \n\t"
        "pxor %%mm7, %%mm7        \n\t"
        ".balign 16                \n\t"
        "1:                        \n\t"
        "movq (%0), %%mm0        \n\t"
        "movq (%0, %2), %%mm2        \n\t"
        "movq %%mm0, %%mm1        \n\t"
        "movq %%mm2, %%mm3        \n\t"
        "punpcklbw %%mm7, %%mm0        \n\t"
        "punpckhbw %%mm7, %%mm1        \n\t"
        "punpcklbw %%mm7, %%mm2        \n\t"
        "punpckhbw %%mm7, %%mm3        \n\t"
        "movq %%mm0, (%1, %%"REG_a")\n\t"
        "movq %%mm1, 8(%1, %%"REG_a")\n\t"
        "movq %%mm2, 16(%1, %%"REG_a")\n\t"
        "movq %%mm3, 24(%1, %%"REG_a")\n\t"
        "add %3, %0                \n\t"
        "add $32, %%"REG_a"        \n\t"
        "js 1b                        \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7        \n\t"
        "mov $-128, %%"REG_a"        \n\t"
        ".balign 16                \n\t"
        "1:                        \n\t"
        "movq (%0), %%mm0        \n\t"
        "movq (%1), %%mm2        \n\t"
        "movq %%mm0, %%mm1        \n\t"
        "movq %%mm2, %%mm3        \n\t"
        "punpcklbw %%mm7, %%mm0        \n\t"
        "punpckhbw %%mm7, %%mm1        \n\t"
        "punpcklbw %%mm7, %%mm2        \n\t"
        "punpckhbw %%mm7, %%mm3        \n\t"
        "psubw %%mm2, %%mm0        \n\t"
        "psubw %%mm3, %%mm1        \n\t"
        "movq %%mm0, (%2, %%"REG_a")\n\t"
        "movq %%mm1, 8(%2, %%"REG_a")\n\t"
        "add %3, %0                \n\t"
        "add %3, %1                \n\t"
        "add $16, %%"REG_a"        \n\t"
        "jnz 1b                        \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
#endif //CONFIG_ENCODERS
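
/* Scalar sketches of the two functions above (illustration only, not part
 * of the original file): get_pixels widens an 8x8 block of bytes into
 * 16-bit DCTELEMs, diff_pixels stores the widened difference s1-s2. */
#if 0
static void get_pixels_ref(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    int i, j;
    for(i=0; i<8; i++, pixels+=line_size)
        for(j=0; j<8; j++)
            block[i*8 + j] = pixels[j];
}
static void diff_pixels_ref(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    int i, j;
    for(i=0; i<8; i++, s1+=stride, s2+=stride)
        for(j=0; j<8; j++)
            block[i*8 + j] = s1[j] - s2[j];
}
#endif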

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
        __asm __volatile(
                "movq        %3, %%mm0\n\t"
                "movq        8%3, %%mm1\n\t"
                "movq        16%3, %%mm2\n\t"
                "movq        24%3, %%mm3\n\t"
                "movq        32%3, %%mm4\n\t"
                "movq        40%3, %%mm5\n\t"
                "movq        48%3, %%mm6\n\t"
                "movq        56%3, %%mm7\n\t"
                "packuswb %%mm1, %%mm0\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                "packuswb %%mm5, %%mm4\n\t"
                "packuswb %%mm7, %%mm6\n\t"
                "movq        %%mm0, (%0)\n\t"
                "movq        %%mm2, (%0, %1)\n\t"
                "movq        %%mm4, (%0, %1, 2)\n\t"
                "movq        %%mm6, (%0, %2)\n\t"
                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
                :"memory");
        pix += line_size*4;
        p += 32;

    // if this were an exact copy of the code above, the compiler
    // would generate some very strange code, thus we use an "r"
    // constraint for the block pointer here
    __asm __volatile(
            "movq        (%3), %%mm0\n\t"
            "movq        8(%3), %%mm1\n\t"
            "movq        16(%3), %%mm2\n\t"
            "movq        24(%3), %%mm3\n\t"
            "movq        32(%3), %%mm4\n\t"
            "movq        40(%3), %%mm5\n\t"
            "movq        48(%3), %%mm6\n\t"
            "movq        56(%3), %%mm7\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            "packuswb %%mm5, %%mm4\n\t"
            "packuswb %%mm7, %%mm6\n\t"
            "movq        %%mm0, (%0)\n\t"
            "movq        %%mm2, (%0, %1)\n\t"
            "movq        %%mm4, (%0, %1, 2)\n\t"
            "movq        %%mm6, (%0, %2)\n\t"
            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
            :"memory");
}

static unsigned char __align8 vector128[8] =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}

void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
                "movq        (%2), %%mm0\n\t"
                "movq        8(%2), %%mm1\n\t"
                "movq        16(%2), %%mm2\n\t"
                "movq        24(%2), %%mm3\n\t"
                "movq        %0, %%mm4\n\t"
                "movq        %1, %%mm6\n\t"
                "movq        %%mm4, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm4\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "paddsw        %%mm4, %%mm0\n\t"
                "paddsw        %%mm5, %%mm1\n\t"
                "movq        %%mm6, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "paddsw        %%mm6, %%mm2\n\t"
                "paddsw        %%mm5, %%mm3\n\t"
                "packuswb %%mm1, %%mm0\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                "movq        %%mm0, %0\n\t"
                "movq        %%mm2, %1\n\t"
                :"+m"(*pix), "+m"(*(pix+line_size))
                :"r"(p)
                :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
         ".balign 8                        \n\t"
         "1:                                \n\t"
         "movd (%1), %%mm0                \n\t"
         "movd (%1, %3), %%mm1                \n\t"
         "movd %%mm0, (%2)                \n\t"
         "movd %%mm1, (%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2                \n\t"
         "movd (%1), %%mm0                \n\t"
         "movd (%1, %3), %%mm1                \n\t"
         "movd %%mm0, (%2)                \n\t"
         "movd %%mm1, (%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2                \n\t"
         "subl $4, %0                        \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
         ".balign 8                        \n\t"
         "1:                                \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
         "movq %%mm0, (%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2                \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
         "movq %%mm0, (%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2                \n\t"
         "subl $4, %0                        \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
         ".balign 8                        \n\t"
         "1:                                \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq 8(%1), %%mm4                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
         "movq 8(%1, %3), %%mm5                \n\t"
         "movq %%mm0, (%2)                \n\t"
         "movq %%mm4, 8(%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "movq %%mm5, 8(%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2               \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq 8(%1), %%mm4                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
         "movq 8(%1, %3), %%mm5                \n\t"
         "movq %%mm0, (%2)                \n\t"
         "movq %%mm4, 8(%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "movq %%mm5, 8(%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2               \n\t"
         "subl $4, %0                        \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "mov $-128*6, %%"REG_a"        \n\t"
                "1:                                \n\t"
                "movq %%mm7, (%0, %%"REG_a")        \n\t"
                "movq %%mm7, 8(%0, %%"REG_a")        \n\t"
                "movq %%mm7, 16(%0, %%"REG_a")        \n\t"
                "movq %%mm7, 24(%0, %%"REG_a")        \n\t"
                "add $32, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "r" (((uint8_t *)blocks)+128*6)
                : "%"REG_a
        );
}

#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "pxor %%mm6, %%mm6                \n\t"
                "1:                                \n\t"
                "movq (%2, %1), %%mm0                \n\t"
                "movq (%2, %1), %%mm1                \n\t"
                "movq 8(%2, %1), %%mm2                \n\t"
                "movq 8(%2, %1), %%mm3                \n\t"
                "punpcklbw %%mm7, %%mm0                \n\t"
                "punpckhbw %%mm7, %%mm1                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpckhbw %%mm7, %%mm3                \n\t"
                "paddw %%mm0, %%mm1                \n\t"
                "paddw %%mm2, %%mm3                \n\t"
                "paddw %%mm1, %%mm3                \n\t"
                "paddw %%mm3, %%mm6                \n\t"
                "add %3, %1                        \n\t"
                " js 1b                                \n\t"
                "movq %%mm6, %%mm5                \n\t"
                "psrlq $32, %%mm6                \n\t"
                "paddw %%mm5, %%mm6                \n\t"
                "movq %%mm6, %%mm5                \n\t"
                "psrlq $16, %%mm6                \n\t"
                "paddw %%mm5, %%mm6                \n\t"
                "movd %%mm6, %0                        \n\t"
                "andl $0xFFFF, %0                \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((long)line_size)
        );

        return sum;
}
#endif //CONFIG_ENCODERS
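
/* Scalar sketch of pix_sum16_mmx (illustration only, not part of the
 * original file): the plain sum of all bytes in a 16x16 block; the final
 * mask mirrors the andl $0xFFFF above (the maximum 16*16*255 = 65280 fits
 * in 16 bits, so nothing is actually lost). */
#if 0
static int pix_sum16_ref(uint8_t *pix, int line_size)
{
    int x, y, sum=0;
    for(y=0; y<16; y++, pix+=line_size)
        for(x=0; x<16; x++)
            sum += pix[x];
    return sum & 0xFFFF;
}
#endif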

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                                \n\t"
        "movq  (%1, %0), %%mm0                \n\t"
        "movq  (%2, %0), %%mm1                \n\t"
        "paddb %%mm0, %%mm1                \n\t"
        "movq %%mm1, (%2, %0)                \n\t"
        "movq 8(%1, %0), %%mm0                \n\t"
        "movq 8(%2, %0), %%mm1                \n\t"
        "paddb %%mm0, %%mm1                \n\t"
        "movq %%mm1, 8(%2, %0)                \n\t"
        "add $16, %0                        \n\t"
        "cmp %3, %0                        \n\t"
        " jb 1b                                \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7                \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpckhbw %%mm7, %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpckhbw %%mm7, %%mm3                \n\t"\
        "psubw %%mm2, %%mm0                \n\t"\
        "psubw %%mm3, %%mm1                \n\t"\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpckhbw %%mm7, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm4                \n\t"\
        "punpckhbw %%mm7, %%mm5                \n\t"\
        "psubw %%mm2, %%mm4                \n\t"\
        "psubw %%mm3, %%mm5                \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4                \n\t"\
        "paddw %%mm1, %%mm5                \n\t"\
        "pxor %%mm6, %%mm6                \n\t"\
        "pcmpgtw %%mm4, %%mm6                \n\t"\
        "pcmpgtw %%mm5, %%mm7                \n\t"\
        "pxor %%mm6, %%mm4                \n\t"\
        "pxor %%mm7, %%mm5                \n\t"\
        "psubw %%mm6, %%mm4                \n\t"\
        "psubw %%mm7, %%mm5                \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm4                \n\t"\
        "packsswb %%mm7, %%mm6                \n\t"\
        "pxor %%mm7, %%mm7                \n\t"\
        "movd %4, %%mm2                        \n\t"\
        "punpcklbw %%mm2, %%mm2                \n\t"\
        "punpcklbw %%mm2, %%mm2                \n\t"\
        "punpcklbw %%mm2, %%mm2                \n\t"\
        "psubusb %%mm4, %%mm2                \n\t"\
        "movq %%mm2, %%mm3                \n\t"\
        "psubusb %%mm4, %%mm3                \n\t"\
        "psubb %%mm3, %%mm2                \n\t"\
        "movq %1, %%mm3                        \n\t"\
        "movq %2, %%mm4                        \n\t"\
        "pxor %%mm6, %%mm3                \n\t"\
        "pxor %%mm6, %%mm4                \n\t"\
        "paddusb %%mm2, %%mm3                \n\t"\
        "psubusb %%mm2, %%mm4                \n\t"\
        "pxor %%mm6, %%mm3                \n\t"\
        "pxor %%mm6, %%mm4                \n\t"\
        "paddusb %%mm2, %%mm2                \n\t"\
        "packsswb %%mm1, %%mm0                \n\t"\
        "pcmpgtb %%mm0, %%mm7                \n\t"\
        "pxor %%mm7, %%mm0                \n\t"\
        "psubb %%mm7, %%mm0                \n\t"\
        "movq %%mm0, %%mm1                \n\t"\
        "psubusb %%mm2, %%mm0                \n\t"\
        "psubb %%mm0, %%mm1                \n\t"\
        "pand %5, %%mm1                        \n\t"\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1                \n\t"\
        "psubb %%mm7, %%mm1                \n\t"\
        "movq %0, %%mm5                        \n\t"\
        "movq %3, %%mm6                        \n\t"\
        "psubb %%mm1, %%mm5                \n\t"\
        "paddb %%mm1, %%mm6                \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                        \n\t"
        "movq %%mm4, %2                        \n\t"
        "movq %%mm5, %0                        \n\t"
        "movq %%mm6, %3                        \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0                \n\t"
        "punpcklbw %%mm3, %%mm2                \n\t"
        "movq %%mm0, %%mm1                \n\t"
        "punpcklwd %%mm2, %%mm0                \n\t"
        "punpckhwd %%mm2, %%mm1                \n\t"
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0                \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1                \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}
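
/* Scalar sketch of transpose4x4 (illustration only, not part of the
 * original file): dst[j][i] = src[i][j] for a 4x4 block of bytes. */
#if 0
static void transpose4x4_ref(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride)
{
    int i, j;
    for(i=0; i<4; i++)
        for(j=0; j<4; j++)
            dst[j*dst_stride + i] = src[i*src_stride + j];
}
#endif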

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    const int strength= ff_h263_loop_filter_strength[qscale];
    uint64_t temp[4] __attribute__ ((aligned(8)));
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1                \n\t"
        "movq %%mm4, %%mm0                \n\t"
        "punpcklbw %%mm3, %%mm5                \n\t"
        "punpcklbw %%mm6, %%mm4                \n\t"
        "punpckhbw %%mm3, %%mm1                \n\t"
        "punpckhbw %%mm6, %%mm0                \n\t"
        "movq %%mm5, %%mm3                \n\t"
        "movq %%mm1, %%mm6                \n\t"
        "punpcklwd %%mm4, %%mm5                \n\t"
        "punpcklwd %%mm0, %%mm1                \n\t"
        "punpckhwd %%mm4, %%mm3                \n\t"
        "punpckhwd %%mm0, %%mm6                \n\t"
        "movd %%mm5, (%0)                \n\t"
        "punpckhdq %%mm5, %%mm5                \n\t"
        "movd %%mm5, (%0,%2)                \n\t"
        "movd %%mm3, (%0,%2,2)                \n\t"
        "punpckhdq %%mm3, %%mm3                \n\t"
        "movd %%mm3, (%0,%3)                \n\t"
        "movd %%mm1, (%1)                \n\t"
        "punpckhdq %%mm1, %%mm1                \n\t"
        "movd %%mm1, (%1,%2)                \n\t"
        "movd %%mm6, (%1,%2,2)                \n\t"
        "punpckhdq %%mm6, %%mm6                \n\t"
        "movd %%mm6, (%1,%3)                \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride ),
           "r" ((long)(3*stride))
    );
}


// out: o = nonzero where |x-y| > a (each byte holds clip(|x-y|-a, 0, 255))
// clobbers: t
#define DIFF_GT_MMX(x,y,a,o,t)\
    "movq     "#y", "#t"  \n\t"\
    "movq     "#x", "#o"  \n\t"\
    "psubusb  "#x", "#t"  \n\t"\
    "psubusb  "#y", "#o"  \n\t"\
    "por      "#t", "#o"  \n\t"\
    "psubusb  "#a", "#o"  \n\t"

// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
// out: mm5=beta-1, mm7=mask
// clobbers: mm4,mm6
#define H264_DEBLOCK_MASK(alpha1, beta1) \
    "pshufw $0, "#alpha1", %%mm4 \n\t"\
    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
    "packuswb  %%mm4, %%mm4      \n\t"\
    "packuswb  %%mm5, %%mm5      \n\t"\
    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
    "por       %%mm4, %%mm7      \n\t"\
    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
    "por       %%mm4, %%mm7      \n\t"\
    "pxor      %%mm6, %%mm6      \n\t"\
    "pcmpeqb   %%mm6, %%mm7      \n\t"

// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
// out: mm1=p0' mm2=q0'
// clobbers: mm0,3-6
#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
        /* a = q0^p0^((p1-q1)>>2) */\
        "movq    %%mm0, %%mm4  \n\t"\
        "psubb   %%mm3, %%mm4  \n\t"\
        "psrlw   $2,    %%mm4  \n\t"\
        "pxor    %%mm1, %%mm4  \n\t"\
        "pxor    %%mm2, %%mm4  \n\t"\
        /* b = p0^(q1>>2) */\
        "psrlw   $2,    %%mm3  \n\t"\
        "pand "#pb_3f", %%mm3  \n\t"\
        "movq    %%mm1, %%mm5  \n\t"\
        "pxor    %%mm3, %%mm5  \n\t"\
        /* c = q0^(p1>>2) */\
        "psrlw   $2,    %%mm0  \n\t"\
        "pand "#pb_3f", %%mm0  \n\t"\
        "movq    %%mm2, %%mm6  \n\t"\
        "pxor    %%mm0, %%mm6  \n\t"\
        /* d = (c^b) & ~(b^a) & 1 */\
        "pxor    %%mm5, %%mm6  \n\t"\
        "pxor    %%mm4, %%mm5  \n\t"\
        "pandn   %%mm6, %%mm5  \n\t"\
        "pand "#pb_01", %%mm5  \n\t"\
        /* delta = (avg(q0, p1>>2) + (d&a))
         *       - (avg(p0, q1>>2) + (d&~a)) */\
        "pavgb   %%mm2, %%mm0  \n\t"\
        "movq    %%mm5, %%mm6  \n\t"\
        "pand    %%mm4, %%mm6  \n\t"\
        "paddusb %%mm6, %%mm0  \n\t"\
        "pavgb   %%mm1, %%mm3  \n\t"\
        "pandn   %%mm5, %%mm4  \n\t"\
        "paddusb %%mm4, %%mm3  \n\t"\
        /* p0 += clip(delta, -tc0, tc0)
         * q0 -= clip(delta, -tc0, tc0) */\
        "movq    %%mm0, %%mm4  \n\t"\
        "psubusb %%mm3, %%mm0  \n\t"\
        "psubusb %%mm4, %%mm3  \n\t"\
        "pminub  %%mm7, %%mm0  \n\t"\
        "pminub  %%mm7, %%mm3  \n\t"\
        "paddusb %%mm0, %%mm1  \n\t"\
        "paddusb %%mm3, %%mm2  \n\t"\
        "psubusb %%mm3, %%mm1  \n\t"\
        "psubusb %%mm0, %%mm2  \n\t"

// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=mm_bone
// out: (q1addr) = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
// clobbers: q2, tmp, tc0
#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
        "movq     %%mm1,  "#tmp"   \n\t"\
        "pavgb    %%mm2,  "#tmp"   \n\t"\
        "pavgb    "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
        "pxor   "q2addr", "#tmp"   \n\t"\
        "pand     %8,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
        "psubusb  "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
        "movq     "#p1",  "#tmp"   \n\t"\
        "psubusb  "#tc0", "#tmp"   \n\t"\
        "paddusb  "#p1",  "#tc0"   \n\t"\
        "pmaxub   "#tmp", "#q2"    \n\t"\
        "pminub   "#tc0", "#q2"    \n\t"\
        "movq     "#q2",  "q1addr" \n\t"

static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
{
    uint64_t tmp0;
    uint64_t tc = (uint8_t)tc0[1]*0x01010000 | (uint8_t)tc0[0]*0x0101;
    // with luma, tc0=0 doesn't mean no filtering, so we need a separate input mask
    uint32_t mask[2] = { (tc0[0]>=0)*0xffffffff, (tc0[1]>=0)*0xffffffff };

    asm volatile(
        "movq    (%1,%3), %%mm0    \n\t" //p1
        "movq    (%1,%3,2), %%mm1  \n\t" //p0
        "movq    (%2),    %%mm2    \n\t" //q0
        "movq    (%2,%3), %%mm3    \n\t" //q1
        H264_DEBLOCK_MASK(%6, %7)
        "pand     %5,     %%mm7    \n\t"
        "movq     %%mm7,  %0       \n\t"

        /* filter p1 */
        "movq     (%1),   %%mm3    \n\t" //p2
        DIFF_GT_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
        "pandn    %%mm7,  %%mm6    \n\t"
        "pcmpeqb  %%mm7,  %%mm6    \n\t"
        "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
        "pshufw  $80, %4, %%mm4    \n\t"
        "pand     %%mm7,  %%mm4    \n\t" // mask & tc0
        "movq     %8,     %%mm7    \n\t"
        "pand     %%mm6,  %%mm7    \n\t" // mask & |p2-p0|<beta & 1
        "pand     %%mm4,  %%mm6    \n\t" // mask & |p2-p0|<beta & tc0
        "paddb    %%mm4,  %%mm7    \n\t" // tc++
        H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)

        /* filter q1 */
        "movq    (%2,%3,2), %%mm4  \n\t" //q2
        DIFF_GT_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
        "pandn    %0,     %%mm6    \n\t"
        "pcmpeqb  %0,     %%mm6    \n\t"
        "pand     %0,     %%mm6    \n\t"
        "pshufw  $80, %4, %%mm5    \n\t"
        "pand     %%mm6,  %%mm5    \n\t"
        "pand     %8,     %%mm6    \n\t"
        "paddb    %%mm6,  %%mm7    \n\t"
        "movq    (%2,%3), %%mm3    \n\t"
        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)

        /* filter p0, q0 */
        H264_DEBLOCK_P0_Q0(%8, %9)
        "movq      %%mm1, (%1,%3,2) \n\t"
        "movq      %%mm2, (%2)      \n\t"

        : "=m"(tmp0)
        : "r"(pix-3*stride), "r"(pix), "r"((long)stride),
          "m"(tc), "m"(*(uint64_t*)mask), "m"(alpha1), "m"(beta1),
          "m"(mm_bone), "m"(ff_pb_3F)
    );
}

static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    if((tc0[0] & tc0[1]) >= 0)
        h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
    if((tc0[2] & tc0[3]) >= 0)
        h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
}
static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    //FIXME: could cut some load/stores by merging transpose with filter
    // also, it only needs to transpose 6x8
    uint8_t trans[8*8];
    int i;
    for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
        if((tc0[0] & tc0[1]) < 0)
            continue;
        transpose4x4(trans,       pix-4,          8, stride);
        transpose4x4(trans  +4*8, pix,            8, stride);
        transpose4x4(trans+4,     pix-4+4*stride, 8, stride);
        transpose4x4(trans+4+4*8, pix  +4*stride, 8, stride);
        h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
        transpose4x4(pix-2,          trans  +2*8, stride, 8);
        transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
    }
}

static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
{
    asm volatile(
        "movq    (%0),    %%mm0     \n\t" //p1
        "movq    (%0,%2), %%mm1     \n\t" //p0
        "movq    (%1),    %%mm2     \n\t" //q0
        "movq    (%1,%2), %%mm3     \n\t" //q1
        H264_DEBLOCK_MASK(%4, %5)
        "movd      %3,    %%mm6     \n\t"
        "punpcklbw %%mm6, %%mm6     \n\t"
        "pand      %%mm6, %%mm7     \n\t" // mm7 = tc&mask
        H264_DEBLOCK_P0_Q0(%6, %7)
        "movq      %%mm1, (%0,%2)   \n\t"
        "movq      %%mm2, (%1)      \n\t"

        :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
           "r"(*(uint32_t*)tc0),
           "m"(alpha1), "m"(beta1), "m"(mm_bone), "m"(ff_pb_3F)
    );
}

static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
}

static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    //FIXME: could cut some load/stores by merging transpose with filter
    uint8_t trans[8*4];
    transpose4x4(trans, pix-2, 8, stride);
    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
    h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
    transpose4x4(pix-2, trans, stride, 8);
    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
}

// p0 = (p0 + q1 + 2*p1 + 2) >> 2
#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
    "movq    "#p0", %%mm4  \n\t"\
    "pxor    "#q1", %%mm4  \n\t"\
    "pand   "#one", %%mm4  \n\t" /* mm4 = (p0^q1)&1 */\
    "pavgb   "#q1", "#p0"  \n\t"\
    "psubusb %%mm4, "#p0"  \n\t"\
    "pavgb   "#p1", "#p0"  \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */

static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
{
    asm volatile(
        "movq    (%0),    %%mm0     \n\t"
        "movq    (%0,%2), %%mm1     \n\t"
        "movq    (%1),    %%mm2     \n\t"
        "movq    (%1,%2), %%mm3     \n\t"
        H264_DEBLOCK_MASK(%3, %4)
        "movq    %%mm1,   %%mm5     \n\t"
        "movq    %%mm2,   %%mm6     \n\t"
        H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
        H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
        "psubb   %%mm5,   %%mm1     \n\t"
        "psubb   %%mm6,   %%mm2     \n\t"
        "pand    %%mm7,   %%mm1     \n\t"
        "pand    %%mm7,   %%mm2     \n\t"
        "paddb   %%mm5,   %%mm1     \n\t"
        "paddb   %%mm6,   %%mm2     \n\t"
        "movq    %%mm1,   (%0,%2)   \n\t"
        "movq    %%mm2,   (%1)      \n\t"
        :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
           "m"(alpha1), "m"(beta1), "m"(mm_bone)
    );
}

static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
}

static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
{
    //FIXME: could cut some load/stores by merging transpose with filter
    uint8_t trans[8*4];
    transpose4x4(trans, pix-2, 8, stride);
    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
    h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
    transpose4x4(pix-2, trans, stride, 8);
    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
}


#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
957
    int tmp;
958
  asm volatile (
959
      "movl $16,%%ecx\n"
960
      "pxor %%mm0,%%mm0\n"
961
      "pxor %%mm7,%%mm7\n"
962
      "1:\n"
963
      "movq (%0),%%mm2\n"        /* mm2 = pix[0-7] */
964
      "movq 8(%0),%%mm3\n"        /* mm3 = pix[8-15] */
965

    
966
      "movq %%mm2,%%mm1\n"        /* mm1 = mm2 = pix[0-7] */
967

    
968
      "punpckhbw %%mm0,%%mm1\n"        /* mm1 = [pix4-7] */
969
      "punpcklbw %%mm0,%%mm2\n"        /* mm2 = [pix0-3] */
970

    
971
      "movq %%mm3,%%mm4\n"        /* mm4 = mm3 = pix[8-15] */
972
      "punpckhbw %%mm0,%%mm3\n"        /* mm3 = [pix12-15] */
973
      "punpcklbw %%mm0,%%mm4\n"        /* mm4 = [pix8-11] */
974

    
975
      "pmaddwd %%mm1,%%mm1\n"        /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
976
      "pmaddwd %%mm2,%%mm2\n"        /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
977

    
978
      "pmaddwd %%mm3,%%mm3\n"
979
      "pmaddwd %%mm4,%%mm4\n"
980

    
981
      "paddd %%mm1,%%mm2\n"        /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
982
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
983
      "paddd %%mm3,%%mm4\n"
984
      "paddd %%mm2,%%mm7\n"
985

    
986
      "add %2, %0\n"
987
      "paddd %%mm4,%%mm7\n"
988
      "dec %%ecx\n"
989
      "jnz 1b\n"
990

    
991
      "movq %%mm7,%%mm1\n"
992
      "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
993
      "paddd %%mm7,%%mm1\n"
994
      "movd %%mm1,%1\n"
995
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
996
    return tmp;
997
}

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"        /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"        /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"        /* mm2 = pix2[0-7] */

      "movq %%mm1,%%mm5\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm5,%%mm2\n"

      "por %%mm1,%%mm2\n"

      "movq %%mm2,%%mm1\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpcklbw %%mm0,%%mm1\n"        /* mm1 now spread over (mm1,mm2) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm1,%%mm1\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm1,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"        /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"        /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"        /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"        /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"        /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n"        /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n"        /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

    
1102
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
1103
    int tmp;
1104
  asm volatile (
1105
      "movl %3,%%ecx\n"
1106
      "pxor %%mm7,%%mm7\n"
1107
      "pxor %%mm6,%%mm6\n"
1108
      
1109
      "movq (%0),%%mm0\n"
1110
      "movq %%mm0, %%mm1\n"
1111
      "psllq $8, %%mm0\n"
1112
      "psrlq $8, %%mm1\n"
1113
      "psrlq $8, %%mm0\n"
1114
      "movq %%mm0, %%mm2\n"
1115
      "movq %%mm1, %%mm3\n"
1116
      "punpcklbw %%mm7,%%mm0\n"
1117
      "punpcklbw %%mm7,%%mm1\n"
1118
      "punpckhbw %%mm7,%%mm2\n"
1119
      "punpckhbw %%mm7,%%mm3\n"
1120
      "psubw %%mm1, %%mm0\n"
1121
      "psubw %%mm3, %%mm2\n"
1122
      
1123
      "add %2,%0\n"
1124
      
1125
      "movq (%0),%%mm4\n"
1126
      "movq %%mm4, %%mm1\n"
1127
      "psllq $8, %%mm4\n"
1128
      "psrlq $8, %%mm1\n"
1129
      "psrlq $8, %%mm4\n"
1130
      "movq %%mm4, %%mm5\n"
1131
      "movq %%mm1, %%mm3\n"
1132
      "punpcklbw %%mm7,%%mm4\n"
1133
      "punpcklbw %%mm7,%%mm1\n"
1134
      "punpckhbw %%mm7,%%mm5\n"
1135
      "punpckhbw %%mm7,%%mm3\n"
1136
      "psubw %%mm1, %%mm4\n"
1137
      "psubw %%mm3, %%mm5\n"
1138
      "psubw %%mm4, %%mm0\n"
1139
      "psubw %%mm5, %%mm2\n"
1140
      "pxor %%mm3, %%mm3\n"
1141
      "pxor %%mm1, %%mm1\n"
1142
      "pcmpgtw %%mm0, %%mm3\n\t"
1143
      "pcmpgtw %%mm2, %%mm1\n\t"
1144
      "pxor %%mm3, %%mm0\n"
1145
      "pxor %%mm1, %%mm2\n"
1146
      "psubw %%mm3, %%mm0\n" 
1147
      "psubw %%mm1, %%mm2\n"
1148
      "paddw %%mm0, %%mm2\n"
1149
      "paddw %%mm2, %%mm6\n"
1150

    
1151
      "add %2,%0\n"
1152
      "1:\n"
1153
  
1154
      "movq (%0),%%mm0\n"
1155
      "movq %%mm0, %%mm1\n"
1156
      "psllq $8, %%mm0\n"
1157
      "psrlq $8, %%mm1\n"
1158
      "psrlq $8, %%mm0\n"
1159
      "movq %%mm0, %%mm2\n"
1160
      "movq %%mm1, %%mm3\n"
1161
      "punpcklbw %%mm7,%%mm0\n"
1162
      "punpcklbw %%mm7,%%mm1\n"
1163
      "punpckhbw %%mm7,%%mm2\n"
1164
      "punpckhbw %%mm7,%%mm3\n"
1165
      "psubw %%mm1, %%mm0\n"
1166
      "psubw %%mm3, %%mm2\n"
1167
      "psubw %%mm0, %%mm4\n"
1168
      "psubw %%mm2, %%mm5\n"
1169
      "pxor %%mm3, %%mm3\n"
1170
      "pxor %%mm1, %%mm1\n"
1171
      "pcmpgtw %%mm4, %%mm3\n\t"
1172
      "pcmpgtw %%mm5, %%mm1\n\t"
1173
      "pxor %%mm3, %%mm4\n"
1174
      "pxor %%mm1, %%mm5\n"
1175
      "psubw %%mm3, %%mm4\n" 
1176
      "psubw %%mm1, %%mm5\n"
1177
      "paddw %%mm4, %%mm5\n"
1178
      "paddw %%mm5, %%mm6\n"
1179
      
1180
      "add %2,%0\n"
1181
      
1182
      "movq (%0),%%mm4\n"
1183
      "movq %%mm4, %%mm1\n"
1184
      "psllq $8, %%mm4\n"
1185
      "psrlq $8, %%mm1\n"
1186
      "psrlq $8, %%mm4\n"
1187
      "movq %%mm4, %%mm5\n"
1188
      "movq %%mm1, %%mm3\n"
1189
      "punpcklbw %%mm7,%%mm4\n"
1190
      "punpcklbw %%mm7,%%mm1\n"
1191
      "punpckhbw %%mm7,%%mm5\n"
1192
      "punpckhbw %%mm7,%%mm3\n"
1193
      "psubw %%mm1, %%mm4\n"
1194
      "psubw %%mm3, %%mm5\n"
1195
      "psubw %%mm4, %%mm0\n"
1196
      "psubw %%mm5, %%mm2\n"
1197
      "pxor %%mm3, %%mm3\n"
1198
      "pxor %%mm1, %%mm1\n"
1199
      "pcmpgtw %%mm0, %%mm3\n\t"
1200
      "pcmpgtw %%mm2, %%mm1\n\t"
1201
      "pxor %%mm3, %%mm0\n"
1202
      "pxor %%mm1, %%mm2\n"
1203
      "psubw %%mm3, %%mm0\n" 
1204
      "psubw %%mm1, %%mm2\n"
1205
      "paddw %%mm0, %%mm2\n"
1206
      "paddw %%mm2, %%mm6\n"
1207

    
1208
      "add %2,%0\n"
1209
      "subl $2, %%ecx\n"
1210
      " jnz 1b\n"
1211

    
1212
      "movq %%mm6, %%mm0\n"
1213
      "punpcklwd %%mm7,%%mm0\n"
1214
      "punpckhwd %%mm7,%%mm6\n"
1215
      "paddd %%mm0, %%mm6\n"
1216
      
1217
      "movq %%mm6,%%mm0\n"
1218
      "psrlq $32, %%mm6\n"
1219
      "paddd %%mm6,%%mm0\n"
1220
      "movd %%mm0,%1\n"
1221
      : "+r" (pix1), "=r"(tmp) 
1222
      : "r" ((long)line_size) , "g" (h-2)
1223
      : "%ecx");
1224
      return tmp;
1225
}

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int score1= sse16_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM
1459

    
1460
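/* vsad16: SAD of the vertical gradient of the difference of two blocks;
 * mm7 is set to 0x80 bytes so the signed byte differences can be
 * rebiased to unsigned before the absolute-value step. */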
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

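/* MMX2 version of vsad16, again letting psadbw do the absolute sums. */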
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

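/* dst[i]= src1[i] - src2[i]; 16 bytes per MMX iteration, the remaining
 * (up to 15) bytes are handled by the scalar loop at the end. */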
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1:                                \n\t"
        "movq  (%2, %0), %%mm0                \n\t"
        "movq  (%1, %0), %%mm1                \n\t"
        "psubb %%mm0, %%mm1                \n\t"
        "movq %%mm1, (%3, %0)                \n\t"
        "movq 8(%2, %0), %%mm0                \n\t"
        "movq 8(%1, %0), %%mm1                \n\t"
        "psubb %%mm0, %%mm1                \n\t"
        "movq %%mm1, 8(%3, %0)                \n\t"
        "add $16, %0                        \n\t"
        "cmp %4, %0                        \n\t"
        " jb 1b                                \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

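/* HuffYUV median prediction: pred= mid_pred(L, T, L+T-LT), computed
 * branchlessly with pmaxub/pminub; the first pixel and the left/top
 * state update are done in C below the asm. */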
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                                \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1                \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3                \n\t" // X
        "movq %%mm2, %%mm4                \n\t" // L
        "psubb %%mm0, %%mm2                \n\t"
        "paddb %%mm1, %%mm2                \n\t" // L + T - LT
        "movq %%mm4, %%mm5                \n\t" // L
        "pmaxub %%mm1, %%mm4                \n\t" // max(T, L)
        "pminub %%mm5, %%mm1                \n\t" // min(T, L)
        "pminub %%mm2, %%mm4                \n\t"
        "pmaxub %%mm1, %%mm4                \n\t"
        "psubb %%mm4, %%mm3                \n\t" // dst - pred
        "movq %%mm3, (%3, %0)                \n\t"
        "add $8, %0                        \n\t"
        "cmp %4, %0                        \n\t"
        " jb 1b                                \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

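/* LBUTTERFLY2 performs two butterflies (a+b, b-a) at once; HADAMARD48
 * chains three stages of them into an 8-point Hadamard transform over
 * mm0..mm7, four 16 bit lanes in parallel. */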
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "                \n\t"\
    "paddw " #b2 ", " #a2 "                \n\t"\
    "paddw " #b1 ", " #b1 "                \n\t"\
    "paddw " #b2 ", " #b2 "                \n\t"\
    "psubw " #a1 ", " #b1 "                \n\t"\
    "psubw " #a2 ", " #b2 "                \n\t"

#define HADAMARD48\
        LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
        LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
        LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
        LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
        LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
        LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\

#define MMABS(a,z)\
    "pxor " #z ", " #z "                \n\t"\
    "pcmpgtw " #a ", " #z "                \n\t"\
    "pxor " #z ", " #a "                \n\t"\
    "psubw " #z ", " #a "                \n\t"

#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "                \n\t"\
    "pcmpgtw " #a ", " #z "                \n\t"\
    "pxor " #z ", " #a "                \n\t"\
    "psubw " #z ", " #a "                \n\t"\
    "paddusw " #a ", " #sum "                \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "                \n\t"\
    "psubw " #a ", " #z "                \n\t"\
    "pmaxsw " #z ", " #a "                \n\t"

#define MMABS_SUM_MMX2(a,z, sum)\
    "pxor " #z ", " #z "                \n\t"\
    "psubw " #a ", " #z "                \n\t"\
    "pmaxsw " #z ", " #a "                \n\t"\
    "paddusw " #a ", " #sum "                \n\t"

#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "                \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "        \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "        \n\t" /* cgdh */\

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

#define LOAD4(o, a, b, c, d)\
        "movq "#o"(%1), " #a "                \n\t"\
        "movq "#o"+16(%1), " #b "        \n\t"\
        "movq "#o"+32(%1), " #c "        \n\t"\
        "movq "#o"+48(%1), " #d "        \n\t"

#define STORE4(o, a, b, c, d)\
        "movq "#a", "#o"(%1)                \n\t"\
        "movq "#b", "#o"+16(%1)                \n\t"\
        "movq "#c", "#o"+32(%1)                \n\t"\
        "movq "#d", "#o"+48(%1)                \n\t"\

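/* 8x8 SATD: Hadamard-transform the rows of the pixel difference,
 * transpose, transform again, then sum the absolute coefficients. */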
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    uint64_t temp[16] __align8;
    int sum=0;

    assert(h==8);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)                \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7                 \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)                \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7                 \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5                \n\t"//FIXME remove
        "movq %%mm6, %%mm7                \n\t"
        "movq %%mm0, %%mm6                \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)                \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1                \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)                \n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)                \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1                \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1                \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1                \n\t"
        "psrlq $32, %%mm0                \n\t"
        "paddusw %%mm1, %%mm0                \n\t"
        "movq %%mm0, %%mm1                \n\t"
        "psrlq $16, %%mm0                \n\t"
        "paddusw %%mm1, %%mm0                \n\t"
        "movd %%mm0, %0                        \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}

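/* identical to hadamard8_diff_mmx except that the MMX2 abs macros use
 * pmaxsw, saving one instruction per absolute value */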
static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    uint64_t temp[16] __align8;
    int sum=0;

    assert(h==8);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)                \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7                 \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)                \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7                 \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5                \n\t"//FIXME remove
        "movq %%mm6, %%mm7                \n\t"
        "movq %%mm0, %%mm6                \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)                \n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1                \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)                \n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)                \n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1                \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1                \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1                \n\t"
        "psrlq $32, %%mm0                \n\t"
        "paddusw %%mm1, %%mm0                \n\t"
        "movq %%mm0, %%mm1                \n\t"
        "psrlq $16, %%mm0                \n\t"
        "paddusw %%mm1, %%mm0                \n\t"
        "movd %%mm0, %0                        \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}


WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
#endif //CONFIG_ENCODERS

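/* full-pel copies involve no averaging, so the no-rounding variants can
 * simply alias the normal put functions */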
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)

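/* QPEL_V_LOW emits one output line of the MPEG-4 quarter-pel filter:
 * (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, i.e. the 8-tap kernel
 * (-1, 3, -6, 20, 20, -6, 3, -1) applied vertically. */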
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
        "paddw " #m4 ", " #m3 "                \n\t" /* x1 */\
        "movq "MANGLE(ff_pw_20)", %%mm4                \n\t" /* 20 */\
        "pmullw " #m3 ", %%mm4                \n\t" /* 20x1 */\
        "movq "#in7", " #m3 "                \n\t" /* d */\
        "movq "#in0", %%mm5                \n\t" /* D */\
        "paddw " #m3 ", %%mm5                \n\t" /* x4 */\
        "psubw %%mm5, %%mm4                \n\t" /* 20x1 - x4 */\
        "movq "#in1", %%mm5                \n\t" /* C */\
        "movq "#in2", %%mm6                \n\t" /* B */\
        "paddw " #m6 ", %%mm5                \n\t" /* x3 */\
        "paddw " #m5 ", %%mm6                \n\t" /* x2 */\
        "paddw %%mm6, %%mm6                \n\t" /* 2x2 */\
        "psubw %%mm6, %%mm5                \n\t" /* -2x2 + x3 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm5        \n\t" /* -6x2 + 3x3 */\
        "paddw " #rnd ", %%mm4                \n\t" /* 20x1 - x4 + rnd */\
        "paddw %%mm4, %%mm5                \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
        "psraw $5, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm5                \n\t"\
        OP(%%mm5, out, %%mm7, d)

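/* QPEL_BASE instantiates the horizontal lowpass filters: the MMX2
 * versions build the shifted source operands with pshufw/psllq inside
 * registers, while the 3DNow versions compute the filter taps in C and
 * only do the rounding, shifting and packing with MMX. */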
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                                \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0                \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1                \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5        \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6        \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2                \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3                \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4                \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5                \n\t" /* b */\
        "paddw %%mm2, %%mm6                \n\t" /* c */\
        "paddw %%mm5, %%mm5                \n\t" /* 2b */\
        "psubw %%mm5, %%mm6                \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5        \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6                \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0                \n\t" /* a */\
        "paddw %%mm1, %%mm5                \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0                \n\t" /* 20a */\
        "psubw %%mm5, %%mm0                \n\t" /* 20a - d */\
        "paddw %6, %%mm6                \n\t"\
        "paddw %%mm6, %%mm0                \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                \n\t"\
        "movq %%mm0, %5                        \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0                \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5                \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2                \n\t" /* b */\
        "paddw %%mm5, %%mm3                \n\t" /* c */\
        "paddw %%mm2, %%mm2                \n\t" /* 2b */\
        "psubw %%mm2, %%mm3                \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2                \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6                \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3                \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1                \n\t" /* a */\
        "paddw %%mm6, %%mm4                \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1                \n\t" /* 20a */\
        "psubw %%mm4, %%mm3                \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                \n\t"\
        "paddw %%mm1, %%mm3                \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                \n\t"\
        "movq %5, %%mm1                        \n\t"\
        "packuswb %%mm3, %%mm1                \n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1                \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4                \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5                \n\t" /* b */\
        "paddw %%mm4, %%mm0                \n\t" /* c */\
        "paddw %%mm5, %%mm5                \n\t" /* 2b */\
        "psubw %%mm5, %%mm0                \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0                \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3                \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2                \n\t" /* d */\
        "psubw %%mm2, %%mm0                \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2                \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5                \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6                \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6                \n\t" /* 20a */\
        "paddw %6, %%mm0                \n\t"\
        "paddw %%mm6, %%mm0                \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3                \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6        \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6                \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4        \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5        \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4                \n\t" /* c */\
        "paddw %%mm2, %%mm5                \n\t" /* d */\
        "paddw %%mm6, %%mm6                \n\t" /* 2b */\
        "psubw %%mm6, %%mm4                \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3                \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4                \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3                \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                \n\t"\
        "paddw %%mm3, %%mm4                \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                \n\t"\
        "packuswb %%mm4, %%mm0                \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "add %3, %0                        \n\t"\
        "add %4, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                                \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0                \n\t"\
            "movq 8(%0), %%mm1                \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0        \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0                \n\t"\
            "movq 24(%0), %%mm1                \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0        \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                                \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0                \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1                \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5        \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6        \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2                \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3                \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4                \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5                \n\t" /* b */\
        "paddw %%mm2, %%mm6                \n\t" /* c */\
        "paddw %%mm5, %%mm5                \n\t" /* 2b */\
        "psubw %%mm5, %%mm6                \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5        \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6                \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0                \n\t" /* a */\
        "paddw %%mm1, %%mm5                \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0                \n\t" /* 20a */\
        "psubw %%mm5, %%mm0                \n\t" /* 20a - d */\
        "paddw %6, %%mm6                \n\t"\
        "paddw %%mm6, %%mm0                \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5                \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6        \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1                \n\t" /* a */\
        "paddw %%mm6, %%mm2                \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6        \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5        \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3                \n\t" /* c */\
        "paddw %%mm5, %%mm4                \n\t" /* d */\
        "paddw %%mm2, %%mm2                \n\t" /* 2b */\
        "psubw %%mm2, %%mm3                \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1                \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3                \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3                \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1                \n\t"\
        "paddw %%mm1, %%mm3                \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                \n\t"\
        "packuswb %%mm3, %%mm0                \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "add %3, %0                        \n\t"\
        "add %4, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0                \n\t"\
            "movq 8(%0), %%mm1                \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0        \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            :"memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}

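/* QPEL_OP instantiates the vertical lowpass filters (QPEL_V_LOW over a
 * 16 bit unpacked temp buffer) and the 16 quarter-pel motion
 * compensation entry points (mc00..mc33), built from the put/avg and
 * _l2 averaging primitives. */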
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                                \n\t"\
        "movq (%0), %%mm0                \n\t"\
        "movq (%0), %%mm1                \n\t"\
        "movq 8(%0), %%mm2                \n\t"\
        "movq 8(%0), %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpckhbw %%mm7, %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpckhbw %%mm7, %%mm3                \n\t"\
        "movq %%mm0, (%1)                \n\t"\
        "movq %%mm1, 17*8(%1)                \n\t"\
        "movq %%mm2, 2*17*8(%1)                \n\t"\
        "movq %%mm3, 3*17*8(%1)                \n\t"\
        "add $8, %1                        \n\t"\
        "add %3, %0                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=4;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7                \n\t"*/\
        "1:                                \n\t"\
        "movq (%0), %%mm0                \n\t"\
        "movq 8(%0), %%mm1                \n\t"\
        "movq 16(%0), %%mm2                \n\t"\
        "movq 24(%0), %%mm3                \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "add $136, %0                        \n\t"\
        "add %6, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
        :"memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*2];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                                \n\t"\
        "movq (%0), %%mm0                \n\t"\
        "movq (%0), %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpckhbw %%mm7, %%mm1                \n\t"\
        "movq %%mm0, (%1)                \n\t"\
        "movq %%mm1, 9*8(%1)                \n\t"\
        "add $8, %1                        \n\t"\
        "add %3, %0                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=2;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7                \n\t"*/\
        "1:                                \n\t"\
        "movq (%0), %%mm0                \n\t"\
        "movq 8(%0), %%mm1                \n\t"\
        "movq 16(%0), %%mm2                \n\t"\
        "movq 24(%0), %%mm3                \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
                \
        "add $72, %0                        \n\t"\
        "add %6, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
         \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
        : "memory"\
   );\
}\
\
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
}\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}

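/* H.264 six-tap filter (1, -5, 20, 20, -5, 1): QPEL_H264V writes one
 * rounded, packed output line; QPEL_H264HV stores the unrounded 16 bit
 * sums to the temp buffer for the second filter pass. */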
#define QPEL_H264V(A,B,C,D,E,F,OP)\
2550
        "movd (%0), "#F"                \n\t"\
2551
        "movq "#C", %%mm6                \n\t"\
2552
        "paddw "#D", %%mm6                \n\t"\
2553
        "psllw $2, %%mm6                \n\t"\
2554
        "psubw "#B", %%mm6                \n\t"\
2555
        "psubw "#E", %%mm6                \n\t"\
2556
        "pmullw %4, %%mm6                \n\t"\
2557
        "add %2, %0                        \n\t"\
2558
        "punpcklbw %%mm7, "#F"                \n\t"\
2559
        "paddw %5, "#A"                        \n\t"\
2560
        "paddw "#F", "#A"                \n\t"\
2561
        "paddw "#A", %%mm6                \n\t"\
2562
        "psraw $5, %%mm6                \n\t"\
2563
        "packuswb %%mm6, %%mm6                \n\t"\
2564
        OP(%%mm6, (%1), A, d)\
2565
        "add %3, %1                        \n\t"     
2566

    
2567
#define QPEL_H264HV(A,B,C,D,E,F,OF)\
        "movd (%0), "#F"                \n\t"\
        "movq "#C", %%mm6                \n\t"\
        "paddw "#D", %%mm6                \n\t"\
        "psllw $2, %%mm6                \n\t"\
        "psubw "#B", %%mm6                \n\t"\
        "psubw "#E", %%mm6                \n\t"\
        "pmullw %3, %%mm6                \n\t"\
        "add %2, %0                        \n\t"\
        "punpcklbw %%mm7, "#F"                \n\t"\
        "paddw "#F", "#A"                \n\t"\
        "paddw "#A", %%mm6                \n\t"\
        "movq %%mm6, "#OF"(%1)                \n\t"

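/* Expands to the 4x4 and 8x8 horizontal, vertical and hv lowpass filters
 * for one OPNAME/MMX variant, plus 16x16 wrappers built from four 8x8
 * calls. */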
#define QPEL_H264(OPNAME, OP, MMX)\
static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=4;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "movq %5, %%mm4                        \n\t"\
        "movq %6, %%mm5                        \n\t"\
        "1:                                \n\t"\
        "movd  -1(%0), %%mm1                \n\t"\
        "movd    (%0), %%mm2                \n\t"\
        "movd   1(%0), %%mm3                \n\t"\
        "movd   2(%0), %%mm0                \n\t"\
        "punpcklbw %%mm7, %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpcklbw %%mm7, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "paddw %%mm0, %%mm1                \n\t"\
        "paddw %%mm3, %%mm2                \n\t"\
        "movd  -2(%0), %%mm0                \n\t"\
        "movd   3(%0), %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpcklbw %%mm7, %%mm3                \n\t"\
        "paddw %%mm3, %%mm0                \n\t"\
        "psllw $2, %%mm2                \n\t"\
        "psubw %%mm1, %%mm2                \n\t"\
        "pmullw %%mm4, %%mm2                \n\t"\
        "paddw %%mm5, %%mm0                \n\t"\
        "paddw %%mm2, %%mm0                \n\t"\
        "psraw $5, %%mm0                \n\t"\
        "packuswb %%mm0, %%mm0                \n\t"\
        OP(%%mm0, (%1),%%mm6, d)\
        "add %3, %0                        \n\t"\
        "add %4, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    src -= 2*srcStride;\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "movd (%0), %%mm0                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm1                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm2                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm3                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm4                \n\t"\
        "add %2, %0                        \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpcklbw %%mm7, %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpcklbw %%mm7, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm4                \n\t"\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int h=4;\
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
        asm volatile(\
            "pxor %%mm7, %%mm7                        \n\t"\
            "movd (%0), %%mm0                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm1                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm2                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm3                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm4                        \n\t"\
            "add %2, %0                                \n\t"\
            "punpcklbw %%mm7, %%mm0                \n\t"\
            "punpcklbw %%mm7, %%mm1                \n\t"\
            "punpcklbw %%mm7, %%mm2                \n\t"\
            "punpcklbw %%mm7, %%mm3                \n\t"\
            "punpcklbw %%mm7, %%mm4                \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
            \
            : "+a"(src)\
            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
            : "memory"\
        );\
        tmp += 4;\
        src += 4 - 9*srcStride;\
    }\
    tmp -= 3*4;\
    asm volatile(\
        "movq %4, %%mm6                        \n\t"\
        "1:                                \n\t"\
        "movq     (%0), %%mm0                \n\t"\
        "paddw  10(%0), %%mm0                \n\t"\
        "movq    2(%0), %%mm1                \n\t"\
        "paddw   8(%0), %%mm1                \n\t"\
        "movq    4(%0), %%mm2                \n\t"\
        "paddw   6(%0), %%mm2                \n\t"\
        "psubw %%mm1, %%mm0                \n\t"/*a-b   (abccba)*/\
        "psraw $2, %%mm0                \n\t"/*(a-b)/4 */\
        "psubw %%mm1, %%mm0                \n\t"/*(a-b)/4-b */\
        "paddsw %%mm2, %%mm0                \n\t"\
        "psraw $2, %%mm0                \n\t"/*((a-b)/4-b)/4 */\
        "paddw %%mm6, %%mm2                \n\t"\
        "paddw %%mm2, %%mm0                \n\t"\
        "psraw $6, %%mm0                \n\t"\
        "packuswb %%mm0, %%mm0                \n\t"\
        OP(%%mm0, (%1),%%mm7, d)\
        "add $24, %0                        \n\t"\
        "add %3, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+a"(tmp), "+c"(dst), "+m"(h)\
        : "S"((long)dstStride), "m"(ff_pw_32)\
        : "memory"\
    );\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "movq %5, %%mm6                        \n\t"\
        "1:                                \n\t"\
        "movq    (%0), %%mm0                \n\t"\
        "movq   1(%0), %%mm2                \n\t"\
        "movq %%mm0, %%mm1                \n\t"\
        "movq %%mm2, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpckhbw %%mm7, %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpckhbw %%mm7, %%mm3                \n\t"\
        "paddw %%mm2, %%mm0                \n\t"\
        "paddw %%mm3, %%mm1                \n\t"\
        "psllw $2, %%mm0                \n\t"\
        "psllw $2, %%mm1                \n\t"\
        "movq   -1(%0), %%mm2                \n\t"\
        "movq    2(%0), %%mm4                \n\t"\
        "movq %%mm2, %%mm3                \n\t"\
        "movq %%mm4, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpckhbw %%mm7, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm4                \n\t"\
        "punpckhbw %%mm7, %%mm5                \n\t"\
        "paddw %%mm4, %%mm2                \n\t"\
        "paddw %%mm3, %%mm5                \n\t"\
        "psubw %%mm2, %%mm0                \n\t"\
        "psubw %%mm5, %%mm1                \n\t"\
        "pmullw %%mm6, %%mm0                \n\t"\
        "pmullw %%mm6, %%mm1                \n\t"\
        "movd   -2(%0), %%mm2                \n\t"\
        "movd    7(%0), %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpcklbw %%mm7, %%mm5                \n\t"\
        "paddw %%mm3, %%mm2                \n\t"\
        "paddw %%mm5, %%mm4                \n\t"\
        "movq %6, %%mm5                        \n\t"\
        "paddw %%mm5, %%mm2                \n\t"\
        "paddw %%mm5, %%mm4                \n\t"\
        "paddw %%mm2, %%mm0                \n\t"\
        "paddw %%mm4, %%mm1                \n\t"\
        "psraw $5, %%mm0                \n\t"\
        "psraw $5, %%mm1                \n\t"\
        "packuswb %%mm1, %%mm0                \n\t"\
        OP(%%mm0, (%1),%%mm5, q)\
        "add %3, %0                        \n\t"\
        "add %4, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h= 2;\
    src -= 2*srcStride;\
    \
    while(h--){\
      asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "movd (%0), %%mm0                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm1                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm2                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm3                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm4                \n\t"\
        "add %2, %0                        \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpcklbw %%mm7, %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpcklbw %%mm7, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm4                \n\t"\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
        QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
        QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
     );\
     src += 4-13*srcStride;\
     dst +=  4-8*dstStride;\
   }\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int h=8;\
    int w=4;\
    src -= 2*srcStride+2;\
    while(w--){\
        asm volatile(\
            "pxor %%mm7, %%mm7                        \n\t"\
            "movd (%0), %%mm0                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm1                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm2                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm3                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm4                        \n\t"\
            "add %2, %0                                \n\t"\
            "punpcklbw %%mm7, %%mm0                \n\t"\
            "punpcklbw %%mm7, %%mm1                \n\t"\
            "punpcklbw %%mm7, %%mm2                \n\t"\
            "punpcklbw %%mm7, %%mm3                \n\t"\
            "punpcklbw %%mm7, %%mm4                \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\
            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\
            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\
            \
            : "+a"(src)\
            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
            : "memory"\
        );\
        tmp += 4;\
        src += 4 - 13*srcStride;\
    }\
    tmp -= 4*4;\
    asm volatile(\
        "movq %4, %%mm6                        \n\t"\
        "1:                                \n\t"\
        "movq     (%0), %%mm0                \n\t"\
        "movq    8(%0), %%mm3                \n\t"\
        "movq    2(%0), %%mm1                \n\t"\
        "movq   10(%0), %%mm4                \n\t"\
        "paddw   %%mm4, %%mm0                \n\t"\
        "paddw   %%mm3, %%mm1                \n\t"\
        "paddw  18(%0), %%mm3                \n\t"\
        "paddw  16(%0), %%mm4                \n\t"\
        "movq    4(%0), %%mm2                \n\t"\
        "movq   12(%0), %%mm5                \n\t"\
        "paddw   6(%0), %%mm2                \n\t"\
        "paddw  14(%0), %%mm5                \n\t"\
        "psubw %%mm1, %%mm0                \n\t"\
        "psubw %%mm4, %%mm3                \n\t"\
        "psraw $2, %%mm0                \n\t"\
        "psraw $2, %%mm3                \n\t"\
        "psubw %%mm1, %%mm0                \n\t"\
        "psubw %%mm4, %%mm3                \n\t"\
        "paddsw %%mm2, %%mm0                \n\t"\
        "paddsw %%mm5, %%mm3                \n\t"\
        "psraw $2, %%mm0                \n\t"\
        "psraw $2, %%mm3                \n\t"\
        "paddw %%mm6, %%mm2                \n\t"\
        "paddw %%mm6, %%mm5                \n\t"\
        "paddw %%mm2, %%mm0                \n\t"\
        "paddw %%mm5, %%mm3                \n\t"\
        "psraw $6, %%mm0                \n\t"\
        "psraw $6, %%mm3                \n\t"\
        "packuswb %%mm3, %%mm0                \n\t"\
        OP(%%mm0, (%1),%%mm7, q)\
        "add $32, %0                        \n\t"\
        "add %3, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+a"(tmp), "+c"(dst), "+m"(h)\
        : "S"((long)dstStride), "m"(ff_pw_32)\
        : "memory"\
    );\
}\
static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp  , src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp  , src+8, dstStride, tmpStride, srcStride);\
}\

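/* Expands to the 16 quarter-pel motion compensation functions
 * (mc00..mc33) for one block SIZE: mc00 is a plain copy, mc20/mc02 run a
 * single lowpass filter, and the remaining positions average the source
 * and/or two filtered intermediate planes via pixels_l2. */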
#define H264_MC(OPNAME, SIZE, MMX) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _mmx(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    uint8_t * const half= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    uint8_t * const half= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+1, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    uint8_t * const half= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    uint8_t * const half= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE+8)/4];\
    int16_t * const tmp= (int16_t*)temp;\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
    uint8_t * const halfV= (uint8_t*)temp;\
    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
    uint8_t * const halfV= (uint8_t*)temp;\
    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\
}\

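/* Store macros plugged in as the OP parameter: a plain store for put,
 * and an average with the existing destination for avg (pavgusb on
 * 3DNow!, pavgb on MMX2). */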
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "        \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "        \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "        \n\t"\
"pavgb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "        \n\t"

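/* Instantiate the qpel macros for the MMX2 and 3DNow! variants;
 * ff_pw_16 is the rounding constant for the rounded functions,
 * ff_pw_15 for the no-rounding ones. */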
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)

QPEL_H264(put_       ,       PUT_OP, 3dnow)
QPEL_H264(avg_       , AVG_3DNOW_OP, 3dnow)
QPEL_H264(put_       ,       PUT_OP, mmx2)
QPEL_H264(avg_       ,  AVG_MMX2_OP, mmx2)

H264_MC(put_, 4, 3dnow)
H264_MC(put_, 8, 3dnow)
H264_MC(put_, 16,3dnow)
H264_MC(avg_, 4, 3dnow)
H264_MC(avg_, 8, 3dnow)
H264_MC(avg_, 16,3dnow)
H264_MC(put_, 4, mmx2)
H264_MC(put_, 8, mmx2)
H264_MC(put_, 16,mmx2)
H264_MC(avg_, 4, mmx2)
H264_MC(avg_, 8, mmx2)
H264_MC(avg_, 16,mmx2)

#if 0
static void just_return() { return; }
#endif

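/* Assigns the put, put_no_rnd and avg variants of one qpel function to
 * the corresponding DSPContext members in a single statement. */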
#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;

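/* MMX version of try_8x8basis(): estimates how the weighted squared
 * error of the residual changes when a scaled basis function is added
 * (used by the encoder's noise-shaping search; the C reference lives in
 * dsputil.c). */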
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    long i=0;

    assert(ABS(scale) < 256);
    scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;

    asm volatile(
        "pcmpeqw %%mm6, %%mm6                \n\t" // -1w
        "psrlw $15, %%mm6                \n\t" //  1w
        "pxor %%mm7, %%mm7                \n\t"
        "movd  %4, %%mm5                \n\t"