ffmpeg / libavcodec / i386 / dsputil_mmx.c @ c998bdd9

/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "../dsputil.h"
#include "../simple_idct.h"
#include "../mpegvideo.h"
#include "mmx.h"

//#undef NDEBUG
//#include <assert.h>

extern const uint8_t ff_h263_loop_filter_strength[32];

int mm_flags; /* multimedia extension flags */

/* pixel operations */
static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
static const uint64_t ff_pw_4  attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
static const uint64_t ff_pw_5  attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;

static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;

#define JUMPALIGN() __asm __volatile (".balign 8"::)
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared libraries it is better to access the constants this way
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
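
/* Added note (not in the original source): under PIC, loading these
 * constants with an "m" operand would go through the GOT, so the macros
 * above synthesize them in-register instead.  Sketch of what MOVQ_BONE
 * computes:
 *   pcmpeqd  reg,reg  -> 0xFFFFFFFFFFFFFFFF  (all ones)
 *   psrlw $15,reg     -> 0x0001000100010001  (word 1s)
 *   packuswb reg,reg  -> 0x0101010101010101  (byte 1s == mm_bone)
 */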

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "        \n\t"\
    "pand " #regb ", " #regr "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pand " #regfe "," #regb "        \n\t"\
    "psrlq $1, " #regb "         \n\t"\
    "paddb " #regb ", " #regr "        \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "        \n\t"\
    "por  " #regb ", " #regr "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pand " #regfe "," #regb "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr "        \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "        \n\t"\
    "movq " #regc ", " #regp "        \n\t"\
    "pand " #regb ", " #regr "        \n\t"\
    "pand " #regd ", " #regp "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pxor " #regc ", " #regd "        \n\t"\
    "pand %%mm6, " #regb "        \n\t"\
    "pand %%mm6, " #regd "        \n\t"\
    "psrlq $1, " #regb "         \n\t"\
    "psrlq $1, " #regd "         \n\t"\
    "paddb " #regb ", " #regr "        \n\t"\
    "paddb " #regd ", " #regp "        \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "        \n\t"\
    "movq " #regc ", " #regp "        \n\t"\
    "por  " #regb ", " #regr "        \n\t"\
    "por  " #regd ", " #regp "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pxor " #regc ", " #regd "        \n\t"\
    "pand %%mm6, " #regb "             \n\t"\
    "pand %%mm6, " #regd "             \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr "        \n\t"\
    "psubb " #regd ", " #regp "        \n\t"

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)                PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)                PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
/* for Athlons, PAVGUSB is preferred */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* introduced only in the MMX2 instruction set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"        \n\t"
        "pxor %%mm7, %%mm7        \n\t"
        ".balign 16                \n\t"
        "1:                        \n\t"
        "movq (%0), %%mm0        \n\t"
        "movq (%0, %2), %%mm2        \n\t"
        "movq %%mm0, %%mm1        \n\t"
        "movq %%mm2, %%mm3        \n\t"
        "punpcklbw %%mm7, %%mm0        \n\t"
        "punpckhbw %%mm7, %%mm1        \n\t"
        "punpcklbw %%mm7, %%mm2        \n\t"
        "punpckhbw %%mm7, %%mm3        \n\t"
        "movq %%mm0, (%1, %%"REG_a")\n\t"
        "movq %%mm1, 8(%1, %%"REG_a")\n\t"
        "movq %%mm2, 16(%1, %%"REG_a")\n\t"
        "movq %%mm3, 24(%1, %%"REG_a")\n\t"
        "add %3, %0                \n\t"
        "add $32, %%"REG_a"        \n\t"
        "js 1b                        \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}
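
/* Added reference (not in the original source): get_pixels_mmx above is an
 * MMX version of this scalar loop, widening 8x8 unsigned bytes to 16-bit
 * DCTELEMs:
 *     for (i = 0; i < 8; i++)
 *         for (j = 0; j < 8; j++)
 *             block[i*8 + j] = pixels[i*line_size + j];
 */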

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7        \n\t"
        "mov $-128, %%"REG_a"        \n\t"
        ".balign 16                \n\t"
        "1:                        \n\t"
        "movq (%0), %%mm0        \n\t"
        "movq (%1), %%mm2        \n\t"
        "movq %%mm0, %%mm1        \n\t"
        "movq %%mm2, %%mm3        \n\t"
        "punpcklbw %%mm7, %%mm0        \n\t"
        "punpckhbw %%mm7, %%mm1        \n\t"
        "punpcklbw %%mm7, %%mm2        \n\t"
        "punpckhbw %%mm7, %%mm3        \n\t"
        "psubw %%mm2, %%mm0        \n\t"
        "psubw %%mm3, %%mm1        \n\t"
        "movq %%mm0, (%2, %%"REG_a")\n\t"
        "movq %%mm1, 8(%2, %%"REG_a")\n\t"
        "add %3, %0                \n\t"
        "add %3, %1                \n\t"
        "add $16, %%"REG_a"        \n\t"
        "jnz 1b                        \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
#endif //CONFIG_ENCODERS

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
        __asm __volatile(
                "movq        %3, %%mm0\n\t"
                "movq        8%3, %%mm1\n\t"
                "movq        16%3, %%mm2\n\t"
                "movq        24%3, %%mm3\n\t"
                "movq        32%3, %%mm4\n\t"
                "movq        40%3, %%mm5\n\t"
                "movq        48%3, %%mm6\n\t"
                "movq        56%3, %%mm7\n\t"
                "packuswb %%mm1, %%mm0\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                "packuswb %%mm5, %%mm4\n\t"
                "packuswb %%mm7, %%mm6\n\t"
                "movq        %%mm0, (%0)\n\t"
                "movq        %%mm2, (%0, %1)\n\t"
                "movq        %%mm4, (%0, %1, 2)\n\t"
                "movq        %%mm6, (%0, %2)\n\t"
                ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
                :"memory");
        pix += line_size*4;
        p += 32;

    // if this were an exact copy of the code above, the compiler
    // would generate some very strange code, hence the "r" constraint
    __asm __volatile(
            "movq        (%3), %%mm0\n\t"
            "movq        8(%3), %%mm1\n\t"
            "movq        16(%3), %%mm2\n\t"
            "movq        24(%3), %%mm3\n\t"
            "movq        32(%3), %%mm4\n\t"
            "movq        40(%3), %%mm5\n\t"
            "movq        48(%3), %%mm6\n\t"
            "movq        56(%3), %%mm7\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            "packuswb %%mm5, %%mm4\n\t"
            "packuswb %%mm7, %%mm6\n\t"
            "movq        %%mm0, (%0)\n\t"
            "movq        %%mm2, (%0, %1)\n\t"
            "movq        %%mm4, (%0, %1, 2)\n\t"
            "movq        %%mm6, (%0, %2)\n\t"
            ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
            :"memory");
}

static unsigned char __align8 vector128[8] =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}

void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
                "movq        (%2), %%mm0\n\t"
                "movq        8(%2), %%mm1\n\t"
                "movq        16(%2), %%mm2\n\t"
                "movq        24(%2), %%mm3\n\t"
                "movq        %0, %%mm4\n\t"
                "movq        %1, %%mm6\n\t"
                "movq        %%mm4, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm4\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "paddsw        %%mm4, %%mm0\n\t"
                "paddsw        %%mm5, %%mm1\n\t"
                "movq        %%mm6, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "paddsw        %%mm6, %%mm2\n\t"
                "paddsw        %%mm5, %%mm3\n\t"
                "packuswb %%mm1, %%mm0\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                "movq        %%mm0, %0\n\t"
                "movq        %%mm2, %1\n\t"
                :"+m"(*pix), "+m"(*(pix+line_size))
                :"r"(p)
                :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
         ".balign 8                        \n\t"
         "1:                                \n\t"
         "movd (%1), %%mm0                \n\t"
         "movd (%1, %3), %%mm1                \n\t"
         "movd %%mm0, (%2)                \n\t"
         "movd %%mm1, (%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2                \n\t"
         "movd (%1), %%mm0                \n\t"
         "movd (%1, %3), %%mm1                \n\t"
         "movd %%mm0, (%2)                \n\t"
         "movd %%mm1, (%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2                \n\t"
         "subl $4, %0                        \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
         ".balign 8                        \n\t"
         "1:                                \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
         "movq %%mm0, (%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2                \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
         "movq %%mm0, (%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2                \n\t"
         "subl $4, %0                        \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
         ".balign 8                        \n\t"
         "1:                                \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq 8(%1), %%mm4                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
         "movq 8(%1, %3), %%mm5                \n\t"
         "movq %%mm0, (%2)                \n\t"
         "movq %%mm4, 8(%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "movq %%mm5, 8(%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2               \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq 8(%1), %%mm4                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
         "movq 8(%1, %3), %%mm5                \n\t"
         "movq %%mm0, (%2)                \n\t"
         "movq %%mm4, 8(%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "movq %%mm5, 8(%2, %3)                \n\t"
         "add %%"REG_a", %1                \n\t"
         "add %%"REG_a", %2               \n\t"
         "subl $4, %0                        \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "mov $-128*6, %%"REG_a"        \n\t"
                "1:                                \n\t"
                "movq %%mm7, (%0, %%"REG_a")        \n\t"
                "movq %%mm7, 8(%0, %%"REG_a")        \n\t"
                "movq %%mm7, 16(%0, %%"REG_a")        \n\t"
                "movq %%mm7, 24(%0, %%"REG_a")        \n\t"
                "add $32, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "r" (((uint8_t *)blocks)+128*6)
                : "%"REG_a
        );
}

#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "pxor %%mm6, %%mm6                \n\t"
                "1:                                \n\t"
                "movq (%2, %1), %%mm0                \n\t"
                "movq (%2, %1), %%mm1                \n\t"
                "movq 8(%2, %1), %%mm2                \n\t"
                "movq 8(%2, %1), %%mm3                \n\t"
                "punpcklbw %%mm7, %%mm0                \n\t"
                "punpckhbw %%mm7, %%mm1                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpckhbw %%mm7, %%mm3                \n\t"
                "paddw %%mm0, %%mm1                \n\t"
                "paddw %%mm2, %%mm3                \n\t"
                "paddw %%mm1, %%mm3                \n\t"
                "paddw %%mm3, %%mm6                \n\t"
                "add %3, %1                        \n\t"
                " js 1b                                \n\t"
                "movq %%mm6, %%mm5                \n\t"
                "psrlq $32, %%mm6                \n\t"
                "paddw %%mm5, %%mm6                \n\t"
                "movq %%mm6, %%mm5                \n\t"
                "psrlq $16, %%mm6                \n\t"
                "paddw %%mm5, %%mm6                \n\t"
                "movd %%mm6, %0                        \n\t"
                "andl $0xFFFF, %0                \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((long)line_size)
        );

        return sum;
}
#endif //CONFIG_ENCODERS

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                                \n\t"
        "movq  (%1, %0), %%mm0                \n\t"
        "movq  (%2, %0), %%mm1                \n\t"
        "paddb %%mm0, %%mm1                \n\t"
        "movq %%mm1, (%2, %0)                \n\t"
        "movq 8(%1, %0), %%mm0                \n\t"
        "movq 8(%2, %0), %%mm1                \n\t"
        "paddb %%mm0, %%mm1                \n\t"
        "movq %%mm1, 8(%2, %0)                \n\t"
        "add $16, %0                        \n\t"
        "cmp %3, %0                        \n\t"
        " jb 1b                                \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
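
/* Added note (not in the original source): H263_LOOP_FILTER below implements
 * the H.263 Annex J deblocking step for one edge.  With the four lines
 * around the edge called A, B | C, D (operands %0..%3), it roughly computes
 *     d = (A - 4*B + 4*C - D) / 8
 * clips d according to the filter strength, then corrects B += d and C -= d,
 * with a smaller correction applied to the outer pixels A and D.
 */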

#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7                \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpckhbw %%mm7, %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpckhbw %%mm7, %%mm3                \n\t"\
        "psubw %%mm2, %%mm0                \n\t"\
        "psubw %%mm3, %%mm1                \n\t"\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpckhbw %%mm7, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm4                \n\t"\
        "punpckhbw %%mm7, %%mm5                \n\t"\
        "psubw %%mm2, %%mm4                \n\t"\
        "psubw %%mm3, %%mm5                \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4                \n\t"\
        "paddw %%mm1, %%mm5                \n\t"\
        "pxor %%mm6, %%mm6                \n\t"\
        "pcmpgtw %%mm4, %%mm6                \n\t"\
        "pcmpgtw %%mm5, %%mm7                \n\t"\
        "pxor %%mm6, %%mm4                \n\t"\
        "pxor %%mm7, %%mm5                \n\t"\
        "psubw %%mm6, %%mm4                \n\t"\
        "psubw %%mm7, %%mm5                \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm4                \n\t"\
        "packsswb %%mm7, %%mm6                \n\t"\
        "pxor %%mm7, %%mm7                \n\t"\
        "movd %4, %%mm2                        \n\t"\
        "punpcklbw %%mm2, %%mm2                \n\t"\
        "punpcklbw %%mm2, %%mm2                \n\t"\
        "punpcklbw %%mm2, %%mm2                \n\t"\
        "psubusb %%mm4, %%mm2                \n\t"\
        "movq %%mm2, %%mm3                \n\t"\
        "psubusb %%mm4, %%mm3                \n\t"\
        "psubb %%mm3, %%mm2                \n\t"\
        "movq %1, %%mm3                        \n\t"\
        "movq %2, %%mm4                        \n\t"\
        "pxor %%mm6, %%mm3                \n\t"\
        "pxor %%mm6, %%mm4                \n\t"\
        "paddusb %%mm2, %%mm3                \n\t"\
        "psubusb %%mm2, %%mm4                \n\t"\
        "pxor %%mm6, %%mm3                \n\t"\
        "pxor %%mm6, %%mm4                \n\t"\
        "paddusb %%mm2, %%mm2                \n\t"\
        "packsswb %%mm1, %%mm0                \n\t"\
        "pcmpgtb %%mm0, %%mm7                \n\t"\
        "pxor %%mm7, %%mm0                \n\t"\
        "psubb %%mm7, %%mm0                \n\t"\
        "movq %%mm0, %%mm1                \n\t"\
        "psubusb %%mm2, %%mm0                \n\t"\
        "psubb %%mm0, %%mm1                \n\t"\
        "pand %5, %%mm1                        \n\t"\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1                \n\t"\
        "psubb %%mm7, %%mm1                \n\t"\
        "movq %0, %%mm5                        \n\t"\
        "movq %3, %%mm6                        \n\t"\
        "psubb %%mm1, %%mm5                \n\t"\
        "paddb %%mm1, %%mm6                \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                        \n\t"
        "movq %%mm4, %2                        \n\t"
        "movq %%mm5, %0                        \n\t"
        "movq %%mm6, %3                        \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0                \n\t"
        "punpcklbw %%mm3, %%mm2                \n\t"
        "movq %%mm0, %%mm1                \n\t"
        "punpcklwd %%mm2, %%mm0                \n\t"
        "punpckhwd %%mm2, %%mm1                \n\t"
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0                \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1                \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}

    
644
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
645
    const int strength= ff_h263_loop_filter_strength[qscale];
646
    uint64_t temp[4] __attribute__ ((aligned(8)));
647
    uint8_t *btemp= (uint8_t*)temp;
648
    
649
    src -= 2;
650

    
651
    transpose4x4(btemp  , src           , 8, stride);
652
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
653
    asm volatile(
654
        H263_LOOP_FILTER // 5 3 4 6
655
        
656
        : "+m" (temp[0]),
657
          "+m" (temp[1]),
658
          "+m" (temp[2]),
659
          "+m" (temp[3])
660
        : "g" (2*strength), "m"(ff_pb_FC)
661
    );
662

    
663
    asm volatile(
664
        "movq %%mm5, %%mm1                \n\t"
665
        "movq %%mm4, %%mm0                \n\t"
666
        "punpcklbw %%mm3, %%mm5                \n\t"
667
        "punpcklbw %%mm6, %%mm4                \n\t"
668
        "punpckhbw %%mm3, %%mm1                \n\t"
669
        "punpckhbw %%mm6, %%mm0                \n\t"
670
        "movq %%mm5, %%mm3                \n\t"
671
        "movq %%mm1, %%mm6                \n\t"
672
        "punpcklwd %%mm4, %%mm5                \n\t"
673
        "punpcklwd %%mm0, %%mm1                \n\t"
674
        "punpckhwd %%mm4, %%mm3                \n\t"
675
        "punpckhwd %%mm0, %%mm6                \n\t"
676
        "movd %%mm5, (%0)                \n\t"
677
        "punpckhdq %%mm5, %%mm5                \n\t"
678
        "movd %%mm5, (%0,%2)                \n\t"
679
        "movd %%mm3, (%0,%2,2)                \n\t"
680
        "punpckhdq %%mm3, %%mm3                \n\t"
681
        "movd %%mm3, (%0,%3)                \n\t"
682
        "movd %%mm1, (%1)                \n\t"
683
        "punpckhdq %%mm1, %%mm1                \n\t"
684
        "movd %%mm1, (%1,%2)                \n\t"
685
        "movd %%mm6, (%1,%2,2)                \n\t"
686
        "punpckhdq %%mm6, %%mm6                \n\t"
687
        "movd %%mm6, (%1,%3)                \n\t"
688
        :: "r" (src),
689
           "r" (src + 4*stride),
690
           "r" ((long)   stride ),
691
           "r" ((long)(3*stride))
692
    );
693
}
694

    
695
// dst = ABS( a - b )
696
#define MMABS_DIFF_MMX2(a,b,dst,z)\
697
    "movq    " #b ", " #dst " \n\t"\
698
    "movq    " #a ", " #z   " \n\t"\
699
    "psubusw " #b ", " #z   " \n\t"\
700
    "psubusw " #a ", " #dst " \n\t"\
701
    "pmaxsw  " #z ", " #dst " \n\t"
702

    
703
// a = clip( a, -tc, tc )
704
#define CLIP_MMX2(a,tc,z)\
705
    "pxor    " #z  ", " #z "  \n\t"\
706
    "psubw   " #tc ", " #z "  \n\t"\
707
    "pmaxsw  " #z  ", " #a "  \n\t"\
708
    "pminsw  " #tc ", " #a "  \n\t"
709

    
710
// in: mm0=p1, mm1=p0, mm2=q0, mm3=q1
711
// out: mm7 = do we filter this pixel?
712
#define H264_DEBLOCK_THRESH(alpha,beta)\
713
    "pxor      %%mm7, %%mm7     \n\t"\
714
    "punpcklbw %%mm7, %%mm0     \n\t"\
715
    "punpcklbw %%mm7, %%mm1     \n\t"\
716
    "punpcklbw %%mm7, %%mm2     \n\t"\
717
    "punpcklbw %%mm7, %%mm3     \n\t"\
718
    MMABS_DIFF_MMX2(%%mm1, %%mm2, %%mm5, %%mm4)\
719
    "movd " #alpha ", %%mm6     \n\t"\
720
    "pshufw    $0, %%mm6, %%mm6 \n\t"\
721
    "pcmpgtw   %%mm5, %%mm6     \n\t" /* ABS(p0-q0) < alpha */\
722
    MMABS_DIFF_MMX2(%%mm0, %%mm1, %%mm5, %%mm4)\
723
    MMABS_DIFF_MMX2(%%mm3, %%mm2, %%mm7, %%mm4)\
724
    "pmaxsw    %%mm7, %%mm5     \n\t"\
725
    "movd  " #beta ", %%mm7     \n\t"\
726
    "pshufw    $0, %%mm7, %%mm7 \n\t"\
727
    "movq      %%mm7, %%mm4     \n\t"\
728
    "pcmpgtw   %%mm5, %%mm7     \n\t" /* ABS(p1-p0) < beta && ABS(q1-q0) < beta */\
729
    "pand      %%mm6, %%mm7     \n\t"
730

    
731
// in: mm0=p1, mm1=p0, mm2=q0, mm3=q1, mm6=tc
732
// out: mm1=p0', mm2=q0'
733
#define H264_DEBLOCK_P0_Q0(pw4)\
734
    "movq   " #pw4 ", %%mm4     \n\t"\
735
    "movq      %%mm2, %%mm5     \n\t"\
736
    "paddw     %%mm4, %%mm0     \n\t"\
737
    "psubw     %%mm1, %%mm5     \n\t"\
738
    "psubw     %%mm3, %%mm0     \n\t"\
739
    "psllw     $2,    %%mm5     \n\t"\
740
    "paddw     %%mm0, %%mm5     \n\t"\
741
    "psraw     $3,    %%mm5     \n\t" /* mm5 = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */\
742
    CLIP_MMX2(%%mm5, %%mm6, %%mm4)    /* delta = clip( mm5, -tc, tc ) */\
743
    "paddw     %%mm5, %%mm1     \n\t" /* p0 += delta */\
744
    "psubw     %%mm5, %%mm2     \n\t" /* q0 -= delta */
745

    
746
// in: mm1=p0, mm2=q0, mm6=tc0
747
// out: mm5=delta
748
#define H264_DEBLOCK_DELTA_PQ1(p1,p2,z)\
749
    "movq      %%mm1, %%mm5     \n\t"\
750
    "pavgb     %%mm2, %%mm5     \n\t"\
751
    "paddw   " #p2 ", %%mm5     \n\t"\
752
    "psraw     $1, %%mm5        \n\t"\
753
    "psubw   " #p1 ", %%mm5     \n\t" /* ( ( q2 + ((p0+q0+1)>>1) ) >> 1 ) - q1 */\
754
    CLIP_MMX2(%%mm5, %%mm6, z)
755

    
756
static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int tc0)
757
{
758
    uint64_t tmp0, tmp1;
759
    asm volatile(
760
        "movd  (%2,%4),   %%mm0        \n\t" //p1
761
        "movd  (%2,%4,2), %%mm1        \n\t" //p0
762
        "movd  (%3),      %%mm2        \n\t" //q0
763
        "movd  (%3,%4),   %%mm3        \n\t" //q1
764
        H264_DEBLOCK_THRESH(%6,%7)
765
        "movq      %%mm7, %0           \n\t"
766

    
767
// filter p1 if ABS(p2-p0) < beta
768
        "movd      (%2),  %%mm3        \n\t"
769
        "pxor      %%mm6, %%mm6        \n\t"
770
        "punpcklbw %%mm6, %%mm3        \n\t" //p2
771
        MMABS_DIFF_MMX2(%%mm1, %%mm3, %%mm5, %%mm6)
772
        "pcmpgtw   %%mm5, %%mm4        \n\t"
773
        "pand      %%mm7, %%mm4        \n\t" // mm4 = ( ABS( p2 - p0 ) < beta && filterp )
774
        "movd      %5,    %%mm6        \n\t"
775
        "pshufw    $0, %%mm6, %%mm6    \n\t" //tc
776

    
777
        H264_DEBLOCK_DELTA_PQ1(%%mm0, %%mm3, %%mm7) // delta = clip( ( p2 + ((p0+q0+1)>>1) ) >> 1 ) - p1 )
778
        "pand      %%mm4, %%mm5        \n\t"
779
        "paddw     %%mm0, %%mm5        \n\t"
780
        "packuswb  %%mm5, %%mm5        \n\t"
781
        "movd      %%mm5, (%2,%4)      \n\t" // *p1 += delta
782
        "psrlw     $15, %%mm4          \n\t"
783
        "paddw     %%mm6, %%mm4        \n\t" // tc++
784
        "movq      %%mm4, %1           \n\t"
785

    
786
// filter q1 if ABS(q2-q0) < beta
787
        "pxor      %%mm7, %%mm7        \n\t"
788
        "movd  (%3,%4),   %%mm3        \n\t" //q1
789
        "movd  (%3,%4,2), %%mm4        \n\t" //q2
790
        "punpcklbw %%mm7, %%mm3        \n\t"
791
        "punpcklbw %%mm7, %%mm4        \n\t"
792
        MMABS_DIFF_MMX2(%%mm2, %%mm4, %%mm5, %%mm7)
793
        "movd      %7,    %%mm7        \n\t"
794
        "pshufw    $0, %%mm7, %%mm7    \n\t"
795
        "pcmpgtw   %%mm5, %%mm7        \n\t"
796

    
797
        H264_DEBLOCK_DELTA_PQ1(%%mm3, %%mm4, %%mm4) // delta = clip( ( q2 + ((p0+q0+1)>>1) ) >> 1 ) - q1 )
798
        "movq      %0,    %%mm4        \n\t"
799
        "pand      %%mm4, %%mm7        \n\t" // mm7 = ( ABS( q2 - q0 ) < beta && filterp )
800
        "pand      %%mm7, %%mm5        \n\t"
801
        "paddw     %%mm3, %%mm5        \n\t"
802
        "packuswb  %%mm5, %%mm5        \n\t"
803
        "movd      %%mm5, (%3,%4)      \n\t" // *q1 += delta
804
        "movq      %1, %%mm6           \n\t"
805
        "psrlw     $15, %%mm7          \n\t"
806
        "paddw     %%mm7, %%mm6        \n\t" // tc++
807
        "movq      %0,    %%mm4        \n\t"
808
        "pand      %%mm4, %%mm6        \n\t"
809

    
810
        H264_DEBLOCK_P0_Q0(%8)
811
        "packuswb  %%mm1, %%mm1        \n\t"
812
        "packuswb  %%mm2, %%mm2        \n\t"
813
        "movd      %%mm1, (%2,%4,2)    \n\t"
814
        "movd      %%mm2, (%3)         \n\t"
815

    
816
        : "=m"(tmp0), "=m"(tmp1)
817
        : "r"(pix-3*stride), "r"(pix), "r"((long)stride),
818
          "r"(tc0), "r"(alpha), "r"(beta), "m"(ff_pw_4)
819
    );
820
}
821

    
822
static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
823
{
824
    int i;
825
    for(i=0; i<4; i++, pix+=4) {
826
        if(tc0[i] < 0)
827
            continue;
828
        h264_loop_filter_luma_mmx2(pix, stride, alpha, beta, tc0[i]);
829
    }
830
}
831

    
832
static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
833
{
834
    uint8_t trans[4*8];
835
    int i;
836
    for(i=0; i<4; i++, pix+=4*stride) {
837
        if(tc0[i] < 0)
838
            continue;
839
        //FIXME: could cut some load/stores by merging transpose with filter
840
        transpose4x4(trans, pix-4, 4, stride);
841
        transpose4x4(trans+4*4, pix, 4, stride);
842
        h264_loop_filter_luma_mmx2(trans+4*4, 4, alpha, beta, tc0[i]);
843
        transpose4x4(pix-2, trans+2*4, stride, 4);
844
    }
845
}
846

    
847
static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
848
{
849
    asm volatile(
850
        "movd    (%0),    %%mm0     \n\t"
851
        "movd    (%0,%2), %%mm1     \n\t"
852
        "movd    (%1),    %%mm2     \n\t"
853
        "movd    (%1,%2), %%mm3     \n\t"
854
        H264_DEBLOCK_THRESH(%4,%5)
855
        "movd      %3,    %%mm6     \n\t"
856
        "pshufw $0x50, %%mm6, %%mm6 \n\t" // mm6 = tc[1], tc[1], tc[0], tc[0]
857
        "pand      %%mm7, %%mm6     \n\t"
858
        H264_DEBLOCK_P0_Q0(%6)
859
        "packuswb  %%mm1, %%mm1     \n\t"
860
        "packuswb  %%mm2, %%mm2     \n\t"
861
        "movd      %%mm1, (%0,%2)   \n\t"
862
        "movd      %%mm2, (%1)      \n\t"
863
        :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
864
           "r"(tc0[1]<<16 | tc0[0]),
865
           "r"(alpha), "g"(beta), "m"(ff_pw_4)
866
    );
867
}
868

    
869
static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
870
{
871
    int i;
872
    for(i=0; i<2; i++) {
873
        h264_loop_filter_chroma_mmx2(pix, stride, alpha, beta, tc0);
874
        pix += 4;
875
        tc0 += 2;
876
    }
877
}
878

    
879
static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
880
{
881
    uint8_t trans[4*4];
882
    int i;
883
    for(i=0; i<2; i++) {
884
        //FIXME: could cut some load/stores by merging transpose with filter
885
        transpose4x4(trans, pix-2, 4, stride);
886
        h264_loop_filter_chroma_mmx2(trans+2*4, 4, alpha, beta, tc0);
887
        transpose4x4(pix-2, trans, stride, 4);
888
        pix += 4*stride;
889
        tc0 += 2;
890
    }
891
}
892

    
893
#ifdef CONFIG_ENCODERS
894
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
895
    int tmp;
896
  asm volatile (
897
      "movl $16,%%ecx\n"
898
      "pxor %%mm0,%%mm0\n"
899
      "pxor %%mm7,%%mm7\n"
900
      "1:\n"
901
      "movq (%0),%%mm2\n"        /* mm2 = pix[0-7] */
902
      "movq 8(%0),%%mm3\n"        /* mm3 = pix[8-15] */
903

    
904
      "movq %%mm2,%%mm1\n"        /* mm1 = mm2 = pix[0-7] */
905

    
906
      "punpckhbw %%mm0,%%mm1\n"        /* mm1 = [pix4-7] */
907
      "punpcklbw %%mm0,%%mm2\n"        /* mm2 = [pix0-3] */
908

    
909
      "movq %%mm3,%%mm4\n"        /* mm4 = mm3 = pix[8-15] */
910
      "punpckhbw %%mm0,%%mm3\n"        /* mm3 = [pix12-15] */
911
      "punpcklbw %%mm0,%%mm4\n"        /* mm4 = [pix8-11] */
912

    
913
      "pmaddwd %%mm1,%%mm1\n"        /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
914
      "pmaddwd %%mm2,%%mm2\n"        /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
915

    
916
      "pmaddwd %%mm3,%%mm3\n"
917
      "pmaddwd %%mm4,%%mm4\n"
918

    
919
      "paddd %%mm1,%%mm2\n"        /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
920
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
921
      "paddd %%mm3,%%mm4\n"
922
      "paddd %%mm2,%%mm7\n"
923

    
924
      "add %2, %0\n"
925
      "paddd %%mm4,%%mm7\n"
926
      "dec %%ecx\n"
927
      "jnz 1b\n"
928

    
929
      "movq %%mm7,%%mm1\n"
930
      "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
931
      "paddd %%mm7,%%mm1\n"
932
      "movd %%mm1,%1\n"
933
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
934
    return tmp;
935
}
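
/* Added reference (not in the original source): pix_norm1 returns the sum
 * of squared pixel values over a 16x16 block; in scalar C:
 *     for (y = 0; y < 16; y++)
 *         for (x = 0; x < 16; x++)
 *             sum += pix[y*line_size + x] * pix[y*line_size + x];
 */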

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"        /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"        /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"        /* mm2 = pix2[0-7] */

      "movq %%mm1,%%mm5\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm5,%%mm2\n"

      "por %%mm1,%%mm2\n"

      "movq %%mm2,%%mm1\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpcklbw %%mm0,%%mm1\n"        /* mm1 now spread over (mm1,mm2) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm1,%%mm1\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm1,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"        /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"        /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"        /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"        /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"        /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n"        /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n"        /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
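
/* Added reference (not in the original source): sse8/sse16 compute the sum
 * of squared differences between two blocks; in scalar C (width 8 or 16):
 *     for (y = 0; y < h; y++)
 *         for (x = 0; x < width; x++) {
 *             int d = pix1[y*line_size + x] - pix2[y*line_size + x];
 *             sum += d * d;
 *         }
 */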

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
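
/* Added note (not in the original source): hf_noise8/hf_noise16 sum the
 * absolute second-order differences (horizontal gradients differenced
 * between adjacent rows), i.e. a rough measure of high-frequency noise.
 * The nsse functions below use this to bias the plain SSE:
 *     nsse = sse + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|
 */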

static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int score1= sse16_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

    
1357
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1358
    int tmp;
1359
    
1360
    assert( (((int)pix) & 7) == 0);
1361
    assert((line_size &7) ==0);
1362
    
1363
#define SUM(in0, in1, out0, out1) \
1364
      "movq (%0), " #out0 "\n"\
1365
      "movq 8(%0), " #out1 "\n"\
1366
      "add %2,%0\n"\
1367
      "psadbw " #out0 ", " #in0 "\n"\
1368
      "psadbw " #out1 ", " #in1 "\n"\
1369
      "paddw " #in1 ", " #in0 "\n"\
1370
      "paddw " #in0 ", %%mm6\n"
1371

    
1372
  asm volatile (
1373
      "movl %3,%%ecx\n"
1374
      "pxor %%mm6,%%mm6\n"
1375
      "pxor %%mm7,%%mm7\n"
1376
      "movq (%0),%%mm0\n"
1377
      "movq 8(%0),%%mm1\n"
1378
      "add %2,%0\n"
1379
      "subl $2, %%ecx\n"
1380
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1381
      "1:\n"
1382
      
1383
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1384
      
1385
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1386
      
1387
      "subl $2, %%ecx\n"
1388
      "jnz 1b\n"
1389

    
1390
      "movd %%mm6,%1\n"
1391
      : "+r" (pix), "=r"(tmp) 
1392
      : "r" ((long)line_size) , "m" (h)
1393
      : "%ecx");
1394
    return tmp;
1395
}
1396
#undef SUM
1397

    
1398
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

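/* MMX2 variant of the inter vertical SAD: same 0x80 bias on the line
   differences, but the absolute sum is taken with psadbw instead of the
   unpack-and-add sequence. */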
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

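/* dst[i] = src1[i] - src2[i]; 16 bytes per iteration in MMX registers,
   with the remaining (up to 15) bytes handled by the scalar tail loop. */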
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1:                                \n\t"
        "movq  (%2, %0), %%mm0                \n\t"
        "movq  (%1, %0), %%mm1                \n\t"
        "psubb %%mm0, %%mm1                \n\t"
        "movq %%mm1, (%3, %0)                \n\t"
        "movq 8(%2, %0), %%mm0                \n\t"
        "movq 8(%1, %0), %%mm1                \n\t"
        "psubb %%mm0, %%mm1                \n\t"
        "movq %%mm1, 8(%3, %0)                \n\t"
        "add $16, %0                        \n\t"
        "cmp %4, %0                        \n\t"
        " jb 1b                                \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

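/* HuffYUV median prediction: for each byte, pred = median(L, T, L+T-LT)
   with T = src1[i], L = src2[i-1] and LT = src1[i-1]; the output is
   src2[i] - pred.  pmaxub/pminub (MMX2) select the median; dst[0] and the
   *left/*left_top state are fixed up in C below. */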
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                                \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1                \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3                \n\t" // X
        "movq %%mm2, %%mm4                \n\t" // L
        "psubb %%mm0, %%mm2                \n\t"
        "paddb %%mm1, %%mm2                \n\t" // L + T - LT
        "movq %%mm4, %%mm5                \n\t" // L
        "pmaxub %%mm1, %%mm4                \n\t" // max(T, L)
        "pminub %%mm5, %%mm1                \n\t" // min(T, L)
        "pminub %%mm2, %%mm4                \n\t"
        "pmaxub %%mm1, %%mm4                \n\t"
        "psubb %%mm4, %%mm3                \n\t" // dst - pred
        "movq %%mm3, (%3, %0)                \n\t"
        "add $8, %0                        \n\t"
        "cmp %4, %0                        \n\t"
        " jb 1b                                \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

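/* One butterfly stage on two register pairs at once:
   a += b; b += b; b -= a, i.e. (a,b) -> (a+b, b-a).  The six
   applications in HADAMARD48 amount to an 8-point Hadamard transform
   on each of the four word lanes of mm0..mm7. */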
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "                \n\t"\
    "paddw " #b2 ", " #a2 "                \n\t"\
    "paddw " #b1 ", " #b1 "                \n\t"\
    "paddw " #b2 ", " #b2 "                \n\t"\
    "psubw " #a1 ", " #b1 "                \n\t"\
    "psubw " #a2 ", " #b2 "                \n\t"

#define HADAMARD48\
        LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
        LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
        LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
        LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
        LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
        LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\

#define MMABS(a,z)\
    "pxor " #z ", " #z "                \n\t"\
    "pcmpgtw " #a ", " #z "                \n\t"\
    "pxor " #z ", " #a "                \n\t"\
    "psubw " #z ", " #a "                \n\t"

#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "                \n\t"\
    "pcmpgtw " #a ", " #z "                \n\t"\
    "pxor " #z ", " #a "                \n\t"\
    "psubw " #z ", " #a "                \n\t"\
    "paddusw " #a ", " #sum "                \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "                \n\t"\
    "psubw " #a ", " #z "                \n\t"\
    "pmaxsw " #z ", " #a "                \n\t"

#define MMABS_SUM_MMX2(a,z, sum)\
    "pxor " #z ", " #z "                \n\t"\
    "psubw " #a ", " #z "                \n\t"\
    "pmaxsw " #z ", " #a "                \n\t"\
    "paddusw " #a ", " #sum "                \n\t"

#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "                \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "        \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "        \n\t" /* cgdh */\

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

#define LOAD4(o, a, b, c, d)\
        "movq "#o"(%1), " #a "                \n\t"\
        "movq "#o"+16(%1), " #b "        \n\t"\
        "movq "#o"+32(%1), " #c "        \n\t"\
        "movq "#o"+48(%1), " #d "        \n\t"

#define STORE4(o, a, b, c, d)\
        "movq "#a", "#o"(%1)                \n\t"\
        "movq "#b", "#o"+16(%1)                \n\t"\
        "movq "#c", "#o"+32(%1)                \n\t"\
        "movq "#d", "#o"+48(%1)                \n\t"\

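/* In effect a SATD: Hadamard-transform the 8x8 block of pixel
   differences along both axes (TRANSPOSE4 in between) and sum the
   absolute values of the coefficients. */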
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    uint64_t temp[16] __align8;
    int sum=0;

    assert(h==8);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)                \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7                 \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)                \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7                 \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5                \n\t"//FIXME remove
        "movq %%mm6, %%mm7                \n\t"
        "movq %%mm0, %%mm6                \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)                \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1                \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)                \n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)                \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1                \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1                \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1                \n\t"
        "psrlq $32, %%mm0                \n\t"
        "paddusw %%mm1, %%mm0                \n\t"
        "movq %%mm0, %%mm1                \n\t"
        "psrlq $16, %%mm0                \n\t"
        "paddusw %%mm1, %%mm0                \n\t"
        "movd %%mm0, %0                        \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}

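/* Same as hadamard8_diff_mmx, but using the shorter psubw/pmaxsw
   absolute-value sequence available with MMX2. */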
static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    uint64_t temp[16] __align8;
    int sum=0;

    assert(h==8);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)                \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7                 \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)                \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7                 \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5                \n\t"//FIXME remove
        "movq %%mm6, %%mm7                \n\t"
        "movq %%mm0, %%mm6                \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)                \n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1                \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)                \n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)                \n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1                \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1                \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1                \n\t"
        "psrlq $32, %%mm0                \n\t"
        "paddusw %%mm1, %%mm0                \n\t"
        "movq %%mm0, %%mm1                \n\t"
        "psrlq $16, %%mm0                \n\t"
        "paddusw %%mm1, %%mm0                \n\t"
        "movd %%mm0, %0                        \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}

WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
#endif //CONFIG_ENCODERS

#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)

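/* One row of the MPEG-4 quarter-pel lowpass filter:
   out = clip8((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5), where x1..x4 are
   the symmetric tap sums of the 8-pixel window. */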
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
        "paddw " #m4 ", " #m3 "                \n\t" /* x1 */\
        "movq "MANGLE(ff_pw_20)", %%mm4                \n\t" /* 20 */\
        "pmullw " #m3 ", %%mm4                \n\t" /* 20x1 */\
        "movq "#in7", " #m3 "                \n\t" /* d */\
        "movq "#in0", %%mm5                \n\t" /* D */\
        "paddw " #m3 ", %%mm5                \n\t" /* x4 */\
        "psubw %%mm5, %%mm4                \n\t" /* 20x1 - x4 */\
        "movq "#in1", %%mm5                \n\t" /* C */\
        "movq "#in2", %%mm6                \n\t" /* B */\
        "paddw " #m6 ", %%mm5                \n\t" /* x3 */\
        "paddw " #m5 ", %%mm6                \n\t" /* x2 */\
        "paddw %%mm6, %%mm6                \n\t" /* 2x2 */\
        "psubw %%mm6, %%mm5                \n\t" /* -2x2 + x3 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm5        \n\t" /* -6x2 + 3x3 */\
        "paddw " #rnd ", %%mm4                \n\t" /* x2 */\
        "paddw %%mm4, %%mm5                \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
        "psraw $5, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm5                \n\t"\
        OP(%%mm5, out, %%mm7, d)

#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                                \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0                \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1                \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5        \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6        \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2                \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3                \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4                \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5                \n\t" /* b */\
        "paddw %%mm2, %%mm6                \n\t" /* c */\
        "paddw %%mm5, %%mm5                \n\t" /* 2b */\
        "psubw %%mm5, %%mm6                \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5        \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6                \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0                \n\t" /* a */\
        "paddw %%mm1, %%mm5                \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0                \n\t" /* 20a */\
        "psubw %%mm5, %%mm0                \n\t" /* 20a - d */\
        "paddw %6, %%mm6                \n\t"\
        "paddw %%mm6, %%mm0                \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                \n\t"\
        "movq %%mm0, %5                        \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0                \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5                \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2                \n\t" /* b */\
        "paddw %%mm5, %%mm3                \n\t" /* c */\
        "paddw %%mm2, %%mm2                \n\t" /* 2b */\
        "psubw %%mm2, %%mm3                \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2                \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6                \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3                \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1                \n\t" /* a */\
        "paddw %%mm6, %%mm4                \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1                \n\t" /* 20a */\
        "psubw %%mm4, %%mm3                \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                \n\t"\
        "paddw %%mm1, %%mm3                \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                \n\t"\
        "movq %5, %%mm1                        \n\t"\
        "packuswb %%mm3, %%mm1                \n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1                \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4                \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5                \n\t" /* b */\
        "paddw %%mm4, %%mm0                \n\t" /* c */\
        "paddw %%mm5, %%mm5                \n\t" /* 2b */\
        "psubw %%mm5, %%mm0                \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0                \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3                \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2                \n\t" /* d */\
        "psubw %%mm2, %%mm0                \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2                \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5                \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6                \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6                \n\t" /* 20a */\
        "paddw %6, %%mm0                \n\t"\
        "paddw %%mm6, %%mm0                \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3                \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6        \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6                \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4        \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5        \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4                \n\t" /* c */\
        "paddw %%mm2, %%mm5                \n\t" /* d */\
        "paddw %%mm6, %%mm6                \n\t" /* 2b */\
        "psubw %%mm6, %%mm4                \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3                \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4                \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3                \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                \n\t"\
        "paddw %%mm3, %%mm4                \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                \n\t"\
        "packuswb %%mm4, %%mm0                \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "add %3, %0                        \n\t"\
        "add %4, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                                \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0                \n\t"\
            "movq 8(%0), %%mm1                \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0        \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0                \n\t"\
            "movq 24(%0), %%mm1                \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0        \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                                \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0                \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1                \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5        \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6        \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2                \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3                \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4                \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5                \n\t" /* b */\
        "paddw %%mm2, %%mm6                \n\t" /* c */\
        "paddw %%mm5, %%mm5                \n\t" /* 2b */\
        "psubw %%mm5, %%mm6                \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5        \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6                \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0                \n\t" /* a */\
        "paddw %%mm1, %%mm5                \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0                \n\t" /* 20a */\
        "psubw %%mm5, %%mm0                \n\t" /* 20a - d */\
        "paddw %6, %%mm6                \n\t"\
        "paddw %%mm6, %%mm0                \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5                \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6        \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1                \n\t" /* a */\
        "paddw %%mm6, %%mm2                \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6        \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5        \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3                \n\t" /* c */\
        "paddw %%mm5, %%mm4                \n\t" /* d */\
        "paddw %%mm2, %%mm2                \n\t" /* 2b */\
        "psubw %%mm2, %%mm3                \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1                \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3                \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3                \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1                \n\t"\
        "paddw %%mm1, %%mm3                \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                \n\t"\
        "packuswb %%mm3, %%mm0                \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "add %3, %0                        \n\t"\
        "add %4, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0                \n\t"\
            "movq 8(%0), %%mm1                \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0        \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            :"memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}

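/* Instantiates the complete set of quarter-pel motion compensation
   functions (qpel8/qpel16, mc00..mc33) for one rounding mode on top of
   the h/v lowpass primitives above; the intermediate positions are built
   by averaging a lowpass result with the source or a second lowpass pass. */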
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                                \n\t"\
        "movq (%0), %%mm0                \n\t"\
        "movq (%0), %%mm1                \n\t"\
        "movq 8(%0), %%mm2                \n\t"\
        "movq 8(%0), %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpckhbw %%mm7, %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpckhbw %%mm7, %%mm3                \n\t"\
        "movq %%mm0, (%1)                \n\t"\
        "movq %%mm1, 17*8(%1)                \n\t"\
        "movq %%mm2, 2*17*8(%1)                \n\t"\
        "movq %%mm3, 3*17*8(%1)                \n\t"\
        "add $8, %1                        \n\t"\
        "add %3, %0                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=4;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7                \n\t"*/\
        "1:                                \n\t"\
        "movq (%0), %%mm0                \n\t"\
        "movq 8(%0), %%mm1                \n\t"\
        "movq 16(%0), %%mm2                \n\t"\
        "movq 24(%0), %%mm3                \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"  \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "add $136, %0                        \n\t"\
        "add %6, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
        :"memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*2];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                                \n\t"\
        "movq (%0), %%mm0                \n\t"\
        "movq (%0), %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpckhbw %%mm7, %%mm1                \n\t"\
        "movq %%mm0, (%1)                \n\t"\
        "movq %%mm1, 9*8(%1)                \n\t"\
        "add $8, %1                        \n\t"\
        "add %3, %0                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=2;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7                \n\t"*/\
        "1:                                \n\t"\
        "movq (%0), %%mm0                \n\t"\
        "movq 8(%0), %%mm1                \n\t"\
        "movq 16(%0), %%mm2                \n\t"\
        "movq 24(%0), %%mm3                \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "add %4, %1                        \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
                \
        "add $72, %0                        \n\t"\
        "add %6, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
         \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
        : "memory"\
   );\
}\
\
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
}\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}

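/* One output row of the H.264 6-tap (1,-5,20,20,-5,1) vertical filter:
   mm6 = 20*(C+D) - 5*(B+E) + A + F, plus the rounding constant in %5,
   shifted right by 5 and clipped to 8 bits. */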
#define QPEL_H264V(A,B,C,D,E,F,OP)\
        "movd (%0), "#F"                \n\t"\
        "movq "#C", %%mm6                \n\t"\
        "paddw "#D", %%mm6                \n\t"\
        "psllw $2, %%mm6                \n\t"\
        "psubw "#B", %%mm6                \n\t"\
        "psubw "#E", %%mm6                \n\t"\
        "pmullw %4, %%mm6                \n\t"\
        "add %2, %0                        \n\t"\
        "punpcklbw %%mm7, "#F"                \n\t"\
        "paddw %5, "#A"                        \n\t"\
        "paddw "#F", "#A"                \n\t"\
        "paddw "#A", %%mm6                \n\t"\
        "psraw $5, %%mm6                \n\t"\
        "packuswb %%mm6, %%mm6                \n\t"\
        OP(%%mm6, (%1), A, d)\
        "add %3, %1                        \n\t"

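/* Same 6-tap filter without rounding or packing: the 16-bit intermediate
   is stored at byte offset OF so that a second pass can filter it along
   the other axis (the hv case below). */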
#define QPEL_H264HV(A,B,C,D,E,F,OF)\
        "movd (%0), "#F"                \n\t"\
        "movq "#C", %%mm6                \n\t"\
        "paddw "#D", %%mm6                \n\t"\
        "psllw $2, %%mm6                \n\t"\
        "psubw "#B", %%mm6                \n\t"\
        "psubw "#E", %%mm6                \n\t"\
        "pmullw %3, %%mm6                \n\t"\
        "add %2, %0                        \n\t"\
        "punpcklbw %%mm7, "#F"                \n\t"\
        "paddw "#F", "#A"                \n\t"\
        "paddw "#A", %%mm6                \n\t"\
        "movq %%mm6, "#OF"(%1)                \n\t"

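/* Generates the 4x4 H.264 quarter-pel lowpass primitives: horizontal,
   vertical, and the two-pass hv filter that runs QPEL_H264HV down the
   columns into tmp and then filters tmp horizontally. */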
#define QPEL_H264(OPNAME, OP, MMX)\
2520
static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2521
    int h=4;\
2522
\
2523
    asm volatile(\
2524
        "pxor %%mm7, %%mm7                \n\t"\
2525
        "movq %5, %%mm4                        \n\t"\
2526
        "movq %6, %%mm5                        \n\t"\
2527
        "1:                                \n\t"\
2528
        "movd  -1(%0), %%mm1                \n\t"\
2529
        "movd    (%0), %%mm2                \n\t"\
2530
        "movd   1(%0), %%mm3                \n\t"\
2531
        "movd   2(%0), %%mm0                \n\t"\
2532
        "punpcklbw %%mm7, %%mm1                \n\t"\
2533
        "punpcklbw %%mm7, %%mm2                \n\t"\
2534
        "punpcklbw %%mm7, %%mm3                \n\t"\
2535
        "punpcklbw %%mm7, %%mm0                \n\t"\
2536
        "paddw %%mm0, %%mm1                \n\t"\
2537
        "paddw %%mm3, %%mm2                \n\t"\
2538
        "movd  -2(%0), %%mm0                \n\t"\
2539
        "movd   3(%0), %%mm3                \n\t"\
2540
        "punpcklbw %%mm7, %%mm0                \n\t"\
2541
        "punpcklbw %%mm7, %%mm3                \n\t"\
2542
        "paddw %%mm3, %%mm0                \n\t"\
2543
        "psllw $2, %%mm2                \n\t"\
2544
        "psubw %%mm1, %%mm2                \n\t"\
2545
        "pmullw %%mm4, %%mm2                \n\t"\
2546
        "paddw %%mm5, %%mm0                \n\t"\
2547
        "paddw %%mm2, %%mm0                \n\t"\
2548
        "psraw $5, %%mm0                \n\t"\
2549
        "packuswb %%mm0, %%mm0                \n\t"\
2550
        OP(%%mm0, (%1),%%mm6, d)\
2551
        "add %3, %0                        \n\t"\
2552
        "add %4, %1                        \n\t"\
2553
        "decl %2                        \n\t"\
2554
        " jnz 1b                        \n\t"\
2555
        : "+a"(src), "+c"(dst), "+m"(h)\
2556
        : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2557
        : "memory"\
2558
    );\
2559
}\
2560
static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    src -= 2*srcStride;\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "movd (%0), %%mm0                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm1                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm2                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm3                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm4                \n\t"\
        "add %2, %0                        \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpcklbw %%mm7, %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpcklbw %%mm7, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm4                \n\t"\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
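/* 2D (hv) filter: the vertical pass below stores unrounded 16-bit sums into tmp, one 4-pixel column strip at a time; the second asm block then filters tmp horizontally and normalizes with roughly (x + 512) >> 10 */\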
static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int h=4;\
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
        asm volatile(\
            "pxor %%mm7, %%mm7                        \n\t"\
            "movd (%0), %%mm0                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm1                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm2                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm3                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm4                        \n\t"\
            "add %2, %0                                \n\t"\
            "punpcklbw %%mm7, %%mm0                \n\t"\
            "punpcklbw %%mm7, %%mm1                \n\t"\
            "punpcklbw %%mm7, %%mm2                \n\t"\
            "punpcklbw %%mm7, %%mm3                \n\t"\
            "punpcklbw %%mm7, %%mm4                \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
            \
            : "+a"(src)\
            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
            : "memory"\
        );\
        tmp += 4;\
        src += 4 - 9*srcStride;\
    }\
    tmp -= 3*4;\
    asm volatile(\
        "movq %4, %%mm6                        \n\t"\
        "1:                                \n\t"\
        "movq     (%0), %%mm0                \n\t"\
        "paddw  10(%0), %%mm0                \n\t"\
        "movq    2(%0), %%mm1                \n\t"\
        "paddw   8(%0), %%mm1                \n\t"\
        "movq    4(%0), %%mm2                \n\t"\
        "paddw   6(%0), %%mm2                \n\t"\
        "psubw %%mm1, %%mm0                \n\t"/*a-b   (abccba)*/\
        "psraw $2, %%mm0                \n\t"/*(a-b)/4 */\
        "psubw %%mm1, %%mm0                \n\t"/*(a-b)/4-b */\
        "paddsw %%mm2, %%mm0                \n\t"/*(a-b)/4-b+c */\
        "psraw $2, %%mm0                \n\t"/*((a-b)/4-b+c)/4 */\
        "paddw %%mm6, %%mm2                \n\t"\
        "paddw %%mm2, %%mm0                \n\t"/*(a-5*b+20*c)/16 +32 */\
        "psraw $6, %%mm0                \n\t"\
        "packuswb %%mm0, %%mm0                \n\t"\
        OP(%%mm0, (%1),%%mm7, d)\
        "add $24, %0                        \n\t"\
        "add %3, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+a"(tmp), "+c"(dst), "+m"(h)\
        : "S"((long)dstStride), "m"(ff_pw_32)\
        : "memory"\
    );\
}\
\
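/* 8-wide horizontal filter: a full row per iteration, unpacked into low (mm0) and high (mm1) word halves */\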
static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "movq %5, %%mm6                        \n\t"\
        "1:                                \n\t"\
        "movq    (%0), %%mm0                \n\t"\
        "movq   1(%0), %%mm2                \n\t"\
        "movq %%mm0, %%mm1                \n\t"\
        "movq %%mm2, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpckhbw %%mm7, %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpckhbw %%mm7, %%mm3                \n\t"\
        "paddw %%mm2, %%mm0                \n\t"\
        "paddw %%mm3, %%mm1                \n\t"\
        "psllw $2, %%mm0                \n\t"\
        "psllw $2, %%mm1                \n\t"\
        "movq   -1(%0), %%mm2                \n\t"\
        "movq    2(%0), %%mm4                \n\t"\
        "movq %%mm2, %%mm3                \n\t"\
        "movq %%mm4, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpckhbw %%mm7, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm4                \n\t"\
        "punpckhbw %%mm7, %%mm5                \n\t"\
        "paddw %%mm4, %%mm2                \n\t"\
        "paddw %%mm3, %%mm5                \n\t"\
        "psubw %%mm2, %%mm0                \n\t"\
        "psubw %%mm5, %%mm1                \n\t"\
        "pmullw %%mm6, %%mm0                \n\t"\
        "pmullw %%mm6, %%mm1                \n\t"\
        "movd   -2(%0), %%mm2                \n\t"\
        "movd    7(%0), %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpcklbw %%mm7, %%mm5                \n\t"\
        "paddw %%mm3, %%mm2                \n\t"\
        "paddw %%mm5, %%mm4                \n\t"\
        "movq %6, %%mm5                        \n\t"\
        "paddw %%mm5, %%mm2                \n\t"\
        "paddw %%mm5, %%mm4                \n\t"\
        "paddw %%mm2, %%mm0                \n\t"\
        "paddw %%mm4, %%mm1                \n\t"\
        "psraw $5, %%mm0                \n\t"\
        "psraw $5, %%mm1                \n\t"\
        "packuswb %%mm1, %%mm0                \n\t"\
        OP(%%mm0, (%1),%%mm5, q)\
        "add %3, %0                        \n\t"\
        "add %4, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
\
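/* 8-row vertical filter, done as two 4-pixel-wide column strips since movd/punpcklbw handle four pixels at a time */\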
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h= 2;\
    src -= 2*srcStride;\
    \
    while(h--){\
      asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "movd (%0), %%mm0                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm1                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm2                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm3                \n\t"\
        "add %2, %0                        \n\t"\
        "movd (%0), %%mm4                \n\t"\
        "add %2, %0                        \n\t"\
        "punpcklbw %%mm7, %%mm0                \n\t"\
        "punpcklbw %%mm7, %%mm1                \n\t"\
        "punpcklbw %%mm7, %%mm2                \n\t"\
        "punpcklbw %%mm7, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm4                \n\t"\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
        QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
        QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
     );\
     src += 4-13*srcStride;\
     dst +=  4-8*dstStride;\
   }\
}\
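/* 8x8 2D (hv) filter: same two-pass scheme, with the horizontal pass computing roughly (a - 5*b + 20*c + 512) >> 10 via shifts so the 16-bit intermediates stay in range (paddsw saturates the one addition that could overflow) */\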
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int h=8;\
    int w=4;\
    src -= 2*srcStride+2;\
    while(w--){\
        asm volatile(\
            "pxor %%mm7, %%mm7                        \n\t"\
            "movd (%0), %%mm0                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm1                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm2                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm3                        \n\t"\
            "add %2, %0                                \n\t"\
            "movd (%0), %%mm4                        \n\t"\
            "add %2, %0                                \n\t"\
            "punpcklbw %%mm7, %%mm0                \n\t"\
            "punpcklbw %%mm7, %%mm1                \n\t"\
            "punpcklbw %%mm7, %%mm2                \n\t"\
            "punpcklbw %%mm7, %%mm3                \n\t"\
            "punpcklbw %%mm7, %%mm4                \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\
            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\
            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\
            \
            : "+a"(src)\
            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
            : "memory"\
        );\
        tmp += 4;\
        src += 4 - 13*srcStride;\
    }\
    tmp -= 4*4;\
    asm volatile(\
        "movq %4, %%mm6                        \n\t"\
        "1:                                \n\t"\
        "movq     (%0), %%mm0                \n\t"\
        "movq    8(%0), %%mm3                \n\t"\
        "movq    2(%0), %%mm1                \n\t"\
        "movq   10(%0), %%mm4                \n\t"\
        "paddw   %%mm4, %%mm0                \n\t"\
        "paddw   %%mm3, %%mm1                \n\t"\
        "paddw  18(%0), %%mm3                \n\t"\
        "paddw  16(%0), %%mm4                \n\t"\
        "movq    4(%0), %%mm2                \n\t"\
        "movq   12(%0), %%mm5                \n\t"\
        "paddw   6(%0), %%mm2                \n\t"\
        "paddw  14(%0), %%mm5                \n\t"\
        "psubw %%mm1, %%mm0                \n\t"\
        "psubw %%mm4, %%mm3                \n\t"\
        "psraw $2, %%mm0                \n\t"\
        "psraw $2, %%mm3                \n\t"\
        "psubw %%mm1, %%mm0                \n\t"\
        "psubw %%mm4, %%mm3                \n\t"\
        "paddsw %%mm2, %%mm0                \n\t"\
        "paddsw %%mm5, %%mm3                \n\t"\
        "psraw $2, %%mm0                \n\t"\
        "psraw $2, %%mm3                \n\t"\
        "paddw %%mm6, %%mm2                \n\t"\
        "paddw %%mm6, %%mm5                \n\t"\
        "paddw %%mm2, %%mm0                \n\t"\
        "paddw %%mm5, %%mm3                \n\t"\
        "psraw $6, %%mm0                \n\t"\
        "psraw $6, %%mm3                \n\t"\
        "packuswb %%mm3, %%mm0                \n\t"\
        OP(%%mm0, (%1),%%mm7, q)\
        "add $32, %0                        \n\t"\
        "add %3, %1                        \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+a"(tmp), "+c"(dst), "+m"(h)\
        : "S"((long)dstStride), "m"(ff_pw_32)\
        : "memory"\
    );\
}\
static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp  , src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp  , src+8, dstStride, tmpStride, srcStride);\
}\

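/* H264_MC generates the 16 quarter-pel motion compensation entry points
 * (mcXY, with X/Y the horizontal/vertical quarter-pel offset) for one
 * block size. Full- and half-pel positions call a single filter; the
 * remaining positions average two filtered halves with pixels_l2. */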
#define H264_MC(OPNAME, SIZE, MMX) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _mmx(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    uint8_t * const half= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    uint8_t * const half= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+1, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    uint8_t * const half= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    uint8_t * const half= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE+8)/4];\
    int16_t * const tmp= (int16_t*)temp;\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
    uint8_t * const halfH= (uint8_t*)temp;\
    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
    uint8_t * const halfV= (uint8_t*)temp;\
    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
    uint8_t * const halfV= (uint8_t*)temp;\
    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\
}\

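/* store macros plugged into the filters above: PUT_OP overwrites dst,
 * the AVG ops average with dst first (pavgusb on 3DNow!, pavgb on MMX2) */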
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "        \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "        \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "        \n\t"\
"pavgb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "        \n\t"

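/* instantiate the generic qpel functions; the no_rnd variants use the
 * rounding constant 15 instead of 16 so that they round down */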
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)

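/* H.264 qpel lowpass filters for both SIMD flavours */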
QPEL_H264(put_       ,       PUT_OP, 3dnow)
QPEL_H264(avg_       , AVG_3DNOW_OP, 3dnow)
QPEL_H264(put_       ,       PUT_OP, mmx2)
QPEL_H264(avg_       ,  AVG_MMX2_OP, mmx2)

H264_MC(put_, 4, 3dnow)
H264_MC(put_, 8, 3dnow)
H264_MC(put_, 16,3dnow)
H264_MC(avg_, 4, 3dnow)
H264_MC(avg_, 8, 3dnow)
H264_MC(avg_, 16,3dnow)
H264_MC(put_, 4, mmx2)
H264_MC(put_, 8, mmx2)
H264_MC(put_, 16,mmx2)
H264_MC(avg_, 4, mmx2)
H264_MC(avg_, 8, mmx2)
H264_MC(avg_, 16,mmx2)

#if 0
static void just_return() { return; }
#endif

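/* SET_QPEL_FUNC wires one triple of qpel function pointers (put,
 * put_no_rnd, avg) into the DSPContext */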
#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;

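/* quantization noise shaping helper: estimates the weight[]ed squared
 * error that would remain in rem[] after adding the scaled basis
 * function (scale is packed into a fixed-point pmulhw factor below) */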
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    long i=0;

    assert(ABS(scale) < 256);
    scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;

    asm volatile(
        "pcmpeqw %%mm6, %%mm6                \n\t" // -1w
        "psrlw $15, %%mm6                \n\t" //  1w
        "pxor %%mm7, %%mm7                \n\t"
        "movd  %4, %%mm5                \n\t"
        "punpcklwd %%mm5, %%mm5                \n\t"
        "punpcklwd %%mm5, %%mm5                \n\t"
        "1:                                \n\t"
        "movq  (%1, %0), %%mm0                \n\t"
        "movq  8(%1, %0), %%mm1                \n\t"
        "pmulhw %%mm5, %%mm0                \n\t"
        "pmulhw %%mm5, %%mm1                \n\t"
        "paddw %%mm6, %%mm0                \n\t"
        "paddw %%mm6, %%mm1                \n\t"
        "psraw $1, %%mm0                \n\t"
        "psraw $1, %%mm1                \n\t"
        "paddw (%2, %0), %%mm0                \n\t"
        "paddw 8(%2, %0), %%mm1                \n\t"
        "psraw $6, %%mm0                \n\t"
        "psraw $6, %%mm1                \n\t"
        "pmullw (%3, %0), %%mm0                \n\t"
        "pmullw 8(%3, %0), %%mm1        \n\t"
        "pmaddwd %%mm0, %%mm0                \n\t"
        "pmaddwd %%mm1, %%mm1                \n\t"
        "paddd %%mm1, %%mm0                \n\t"
        "psrld $4, %%mm0                \n\t"
        "paddd %%mm0, %%mm7                \n\t"
        "add $16, %0                        \n\t"
        "cmp $128, %0                        \n\t" //FIXME optimize & bench
        " jb 1b                                \n\t"
        "movq %%mm7, %%mm6                \n\t"
        "psrlq $32, %%mm7                \n\t"
        "paddd %%mm6, %%mm7                \n\t"
        "psrld $2, %%mm7                \n\t"
        "movd %%mm7, %0                        \n\t"

        : "+r" (i)
        : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
    );
    return i;
}

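/* rem[] += the scaled basis function, with the same fixed-point rounding
 * as try_8x8basis_mmx; the MMX path only handles |scale| < 256, larger
 * scales take the fallback branch */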
static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
    long i=0;

    if(ABS(scale) < 256){
        scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
        asm volatile(
                "pcmpeqw %%mm6, %%mm6                \n\t" // -1w
                "psrlw $15, %%mm6                \n\t" //  1w
                "movd  %3, %%mm5                \n\t"
                "punpcklwd %%mm5, %%mm5                \n\t"
                "punpcklwd %%mm5, %%mm5                \n\t"
                "1:                                \n\t"
                "movq  (%1, %0), %%mm0                \n\t"
                "movq  8(%1, %0), %%mm1                \n\t"
                "pmulhw %%mm5, %%mm0                \n\t"
                "pmulhw %%mm5, %%mm1                \n\t"
                "paddw %%mm6, %%mm0                \n\t"
                "paddw %%mm6, %%mm1                \n\t"
                "psraw $1, %%mm0                \n\t"
                "psraw $1, %%mm1                \n\t"
                "paddw (%2, %0), %%mm0                \n\t"
                "paddw 8(%2, %0), %%mm1                \n\t"
                "movq %%mm0, (%2, %0)                \n\t"
                "movq %%mm1, 8(%2, %0)                \n\t"
                "add $16, %0                        \n\t"
                "cmp $128, %0                        \n\t" //FIXME optimize & bench
                " jb 1b                                \n\t"

                : "+r" (i)
                : "r"(basis), "r"(rem), "g"(scale)
        );
    }else{