/* ffmpeg / libavcodec / i386 / dsputil_mmx.c @ revision 11f18faf */

1
/*
2
 * MMX optimized DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 *
19
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20
 */
21

    
22
#include "../dsputil.h"

/* Runtime CPU capability bitmask (MM_MMX, MM_MMXEXT, MM_3DNOW, ...);
   filled in by dsputil_init_mmx() via mm_support(). */
int mm_flags; /* multimedia extension flags */
/* FIXME use them in static form */
/* Sum-of-absolute-differences helpers implemented in separate asm files:
   plain-MMX and MMX2 variants, for 16x16 and 8x8 blocks, with optional
   half-pel interpolation in x (x2), y (y2) or both (xy2). */
int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);

/* pixel operations */
/* 8-byte-aligned packed constants for MMX loads:
   mm_bone: 0x01 in each of 8 bytes, mm_wone: 0x0001 in each of 4 words,
   mm_wtwo: 0x0002 in each of 4 words (rounding biases for averaging). */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

/* Align the next instruction (typically a loop head) to an 8-byte boundary. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* Zero an MMX register (xor with itself). */
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* Load 0x0001000100010001 into regd without touching memory:
   pcmpeqd sets all bits, psrlw $15 leaves 1 in each 16-bit word. */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* Load 0xfefefefefefefefe into regd: all-ones plus all-ones byte-wise
   (0xff + 0xff wraps to 0xfe in each byte). */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* Non-PIC build: constants can be loaded directly from memory. */
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* Synthesize 0x0101... : words of 1, then packuswb folds them to bytes of 1. */
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* Synthesize 0x0002... : words of 1 shifted left by one. */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
83

    
84
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
/* Byte-wise average rounding DOWN, without overflow:
   regr = (a & b) + (((a ^ b) & 0xfe) >> 1). */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "        \n\t"\
    "pand " #regb ", " #regr "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pand " #regfe "," #regb "        \n\t"\
    "psrlq $1, " #regb "         \n\t"\
    "paddb " #regb ", " #regr "        \n\t"

/* Byte-wise average rounding UP, without overflow:
   regr = (a | b) - (((a ^ b) & 0xfe) >> 1). */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "        \n\t"\
    "por  " #regb ", " #regr "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pand " #regfe "," #regb "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr "        \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
/* Two interleaved no-round averages: regr = avg_down(a,b),
   regp = avg_down(c,d); b and d are trashed. */
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "        \n\t"\
    "movq " #regc ", " #regp "        \n\t"\
    "pand " #regb ", " #regr "        \n\t"\
    "pand " #regd ", " #regp "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pxor " #regc ", " #regd "        \n\t"\
    "pand %%mm6, " #regb "        \n\t"\
    "pand %%mm6, " #regd "        \n\t"\
    "psrlq $1, " #regb "         \n\t"\
    "psrlq $1, " #regd "         \n\t"\
    "paddb " #regb ", " #regr "        \n\t"\
    "paddb " #regd ", " #regp "        \n\t"

/* Two interleaved rounding averages: regr = avg_up(a,b),
   regp = avg_up(c,d); b and d are trashed. */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "        \n\t"\
    "movq " #regc ", " #regp "        \n\t"\
    "por  " #regb ", " #regr "        \n\t"\
    "por  " #regd ", " #regp "        \n\t"\
    "pxor " #rega ", " #regb "        \n\t"\
    "pxor " #regc ", " #regd "        \n\t"\
    "pand %%mm6, " #regb "             \n\t"\
    "pand %%mm6, " #regd "             \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr "        \n\t"\
    "psubb " #regd ", " #regp "        \n\t"
131

    
132
/***********************************/
/* MMX no rounding */
/* Template trick: dsputil_mmx_rnd.h / dsputil_mmx_avg.h are included
   several times with different DEF/SET_RND/PAVGB(P) macros to stamp out
   the no-round / rounding / 3dnow / MMX2 variants of the pixel routines. */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)                PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)                PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
/* for Athlons PAVGUSB is prefered */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB
184

    
185
/***********************************/
186
/* standard MMX */
187

    
188
/* Widen an 8x8 block of unsigned bytes into 64 16-bit DCTELEMs.
   Processes two 8-pixel rows per iteration; %%eax runs from -128 up to 0
   and indexes relative to block+64, so the loop ends when eax turns
   non-negative.  %3 is line_size*2 (the two-row stride). */
static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
{
    asm volatile(
        "movl $-128, %%eax        \n\t"
        "pxor %%mm7, %%mm7        \n\t"
        ".balign 16                \n\t"
        "1:                        \n\t"
        "movq (%0), %%mm0        \n\t"
        "movq (%0, %2), %%mm2        \n\t"
        "movq %%mm0, %%mm1        \n\t"
        "movq %%mm2, %%mm3        \n\t"
        "punpcklbw %%mm7, %%mm0        \n\t"
        "punpckhbw %%mm7, %%mm1        \n\t"
        "punpcklbw %%mm7, %%mm2        \n\t"
        "punpckhbw %%mm7, %%mm3        \n\t"
        "movq %%mm0, (%1, %%eax)\n\t"
        "movq %%mm1, 8(%1, %%eax)\n\t"
        "movq %%mm2, 16(%1, %%eax)\n\t"
        "movq %%mm3, 24(%1, %%eax)\n\t"
        "addl %3, %0                \n\t"
        "addl $32, %%eax        \n\t"
        "js 1b                        \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        : "%eax"
    );
}
215

    
216
/* block[i] = s1[i] - s2[i] widened to 16 bits, for an 8x8 block.
   One 8-pixel row per iteration; %%eax counts -128..0 relative to
   block+64, terminating when it reaches zero. */
static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7        \n\t"
        "movl $-128, %%eax        \n\t"
        ".balign 16                \n\t"
        "1:                        \n\t"
        "movq (%0), %%mm0        \n\t"
        "movq (%1), %%mm2        \n\t"
        "movq %%mm0, %%mm1        \n\t"
        "movq %%mm2, %%mm3        \n\t"
        "punpcklbw %%mm7, %%mm0        \n\t"
        "punpckhbw %%mm7, %%mm1        \n\t"
        "punpcklbw %%mm7, %%mm2        \n\t"
        "punpckhbw %%mm7, %%mm3        \n\t"
        "psubw %%mm2, %%mm0        \n\t"
        "psubw %%mm3, %%mm1        \n\t"
        "movq %%mm0, (%2, %%eax)\n\t"
        "movq %%mm1, 8(%2, %%eax)\n\t"
        "addl %3, %0                \n\t"
        "addl %3, %1                \n\t"
        "addl $16, %%eax        \n\t"
        "jnz 1b                        \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        : "%eax"
    );
}
244

    
245
/* Store a 64-element DCTELEM block as an 8x8 byte block, saturating each
   16-bit value to 0..255 via packuswb.  Done in two unrolled halves of
   four rows each. */
void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    /* First half: %3 is an "m" operand (*p); "8%3" etc. prepend a byte
       offset to that memory reference. */
        __asm __volatile(
                "movq        %3, %%mm0\n\t"
                "movq        8%3, %%mm1\n\t"
                "movq        16%3, %%mm2\n\t"
                "movq        24%3, %%mm3\n\t"
                "movq        32%3, %%mm4\n\t"
                "movq        40%3, %%mm5\n\t"
                "movq        48%3, %%mm6\n\t"
                "movq        56%3, %%mm7\n\t"
                "packuswb %%mm1, %%mm0\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                "packuswb %%mm5, %%mm4\n\t"
                "packuswb %%mm7, %%mm6\n\t"
                "movq        %%mm0, (%0)\n\t"
                "movq        %%mm2, (%0, %1)\n\t"
                "movq        %%mm4, (%0, %1, 2)\n\t"
                "movq        %%mm6, (%0, %2)\n\t"
                ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
                :"memory");
        pix += line_size*4;
        p += 32;

    // if here would be an exact copy of the code above
    // compiler would generate some very strange code
    // thus using "r"
    __asm __volatile(
            "movq        (%3), %%mm0\n\t"
            "movq        8(%3), %%mm1\n\t"
            "movq        16(%3), %%mm2\n\t"
            "movq        24(%3), %%mm3\n\t"
            "movq        32(%3), %%mm4\n\t"
            "movq        40(%3), %%mm5\n\t"
            "movq        48(%3), %%mm6\n\t"
            "movq        56(%3), %%mm7\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            "packuswb %%mm5, %%mm4\n\t"
            "packuswb %%mm7, %%mm6\n\t"
            "movq        %%mm0, (%0)\n\t"
            "movq        %%mm2, (%0, %1)\n\t"
            "movq        %%mm4, (%0, %1, 2)\n\t"
            "movq        %%mm6, (%0, %2)\n\t"
            ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
            :"memory");
}
299

    
300
/* pixels[i] = clamp(pixels[i] + block[i]) for an 8x8 block: widen two
   pixel rows, add the DCT coefficients with signed saturation (paddsw),
   then repack with unsigned saturation (packuswb).  Two rows per
   iteration, four iterations.  mm7 is kept zero for the unpacks. */
void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
                "movq        (%2), %%mm0\n\t"
                "movq        8(%2), %%mm1\n\t"
                "movq        16(%2), %%mm2\n\t"
                "movq        24(%2), %%mm3\n\t"
                "movq        %0, %%mm4\n\t"
                "movq        %1, %%mm6\n\t"
                "movq        %%mm4, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm4\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "paddsw        %%mm4, %%mm0\n\t"
                "paddsw        %%mm5, %%mm1\n\t"
                "movq        %%mm6, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "paddsw        %%mm6, %%mm2\n\t"
                "paddsw        %%mm5, %%mm3\n\t"
                "packuswb %%mm1, %%mm0\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                "movq        %%mm0, %0\n\t"
                "movq        %%mm2, %1\n\t"
                :"+m"(*pix), "+m"(*(pix+line_size))
                :"r"(p)
                :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}
340

    
341
/* Copy an 8-pixel-wide block of h rows from pixels to block.
   Four rows per loop iteration (two pairs), so h is assumed to be a
   multiple of 4 — TODO confirm against callers.  %%eax holds
   line_size*2, the two-row stride. */
static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%eax                \n\t"
         ".balign 8                        \n\t"
         "1:                                \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
              "movq %%mm0, (%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "addl %%eax, %1                \n\t"
         "addl %%eax, %2                       \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
         "movq %%mm0, (%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "addl %%eax, %1                \n\t"
         "addl %%eax, %2                       \n\t"
         "subl $4, %0                        \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"(line_size)
         : "%eax", "memory"
        );
}
366

    
367
/* Copy a 16-pixel-wide block of h rows (two movq per row).  Same loop
   structure as put_pixels8_mmx: four rows per iteration, h assumed to
   be a multiple of 4 — TODO confirm against callers. */
static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%eax                \n\t"
         ".balign 8                        \n\t"
         "1:                                \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq 8(%1), %%mm4                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
         "movq 8(%1, %3), %%mm5                \n\t"
              "movq %%mm0, (%2)                \n\t"
              "movq %%mm4, 8(%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "movq %%mm5, 8(%2, %3)                \n\t"
         "addl %%eax, %1                \n\t"
         "addl %%eax, %2                       \n\t"
         "movq (%1), %%mm0                \n\t"
         "movq 8(%1), %%mm4                \n\t"
         "movq (%1, %3), %%mm1                \n\t"
         "movq 8(%1, %3), %%mm5                \n\t"
         "movq %%mm0, (%2)                \n\t"
         "movq %%mm4, 8(%2)                \n\t"
         "movq %%mm1, (%2, %3)                \n\t"
         "movq %%mm5, 8(%2, %3)                \n\t"
         "addl %%eax, %1                \n\t"
         "addl %%eax, %2                       \n\t"
         "subl $4, %0                        \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"(line_size)
         : "%eax", "memory"
        );
}
400

    
401
/* Zero six consecutive 64-element DCTELEM blocks (6*128 = 768 bytes).
   %%eax counts from -768 up to 0, 32 bytes per iteration, indexing off
   blocks+768 so the loop ends when eax becomes non-negative.
   NOTE(review): no "memory" clobber is declared even though the asm
   writes through the pointer — relies on the compiler not caching the
   blocks contents across this statement; verify on modern compilers. */
static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "movl $-128*6, %%eax                \n\t"
                "1:                                \n\t"
                "movq %%mm7, (%0, %%eax)        \n\t"
                "movq %%mm7, 8(%0, %%eax)        \n\t"
                "movq %%mm7, 16(%0, %%eax)        \n\t"
                "movq %%mm7, 24(%0, %%eax)        \n\t"
                "addl $32, %%eax                \n\t"
                " js 1b                                \n\t"
                : : "r" (((int)blocks)+128*6)
                : "%eax"
        );
}
417

    
418
/* Sum all 256 pixels of a 16x16 block.  index starts at -line_size*16
   and counts up to 0; %2 is pix-index, so (%2, %1) walks the rows.
   Each quadword is loaded twice and unpacked into low/high word halves
   against zeroed mm7, accumulated as words in mm6, then reduced
   horizontally (32-bit shift/add, 16-bit shift/add) and masked to the
   low 16 bits. */
static int pix_sum16_mmx(UINT8 * pix, int line_size){
    const int h=16;
    int sum;
    int index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "pxor %%mm6, %%mm6                \n\t"
                "1:                                \n\t"
                "movq (%2, %1), %%mm0                \n\t"
                "movq (%2, %1), %%mm1                \n\t"
                "movq 8(%2, %1), %%mm2                \n\t"
                "movq 8(%2, %1), %%mm3                \n\t"
                "punpcklbw %%mm7, %%mm0                \n\t"
                "punpckhbw %%mm7, %%mm1                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpckhbw %%mm7, %%mm3                \n\t"
                "paddw %%mm0, %%mm1                \n\t"
                "paddw %%mm2, %%mm3                \n\t"
                "paddw %%mm1, %%mm3                \n\t"
                "paddw %%mm3, %%mm6                \n\t"
                "addl %3, %1                        \n\t"
                " js 1b                                \n\t"
                "movq %%mm6, %%mm5                \n\t"
                "psrlq $32, %%mm6                \n\t"
                "paddw %%mm5, %%mm6                \n\t"
                "movq %%mm6, %%mm5                \n\t"
                "psrlq $16, %%mm6                \n\t"
                "paddw %%mm5, %%mm6                \n\t"
                "movd %%mm6, %0                        \n\t"
                "andl $0xFFFF, %0                \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" (line_size)
        );

        return sum;
}
455

    
456
/* dst[i] += src[i] (byte-wise, wrapping) for i in [0, w).  The MMX loop
   handles 16 bytes per iteration while i < w-15; the scalar tail loop
   finishes the remainder.
   NOTE(review): the asm has no "memory" clobber or memory output
   operand for dst — the compiler is not told dst's bytes change here;
   verify this is safe with the project's compiler settings. */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    int i=0;
    asm volatile(
        "1:                                \n\t"
        "movq  (%1, %0), %%mm0                \n\t"
        "movq  (%2, %0), %%mm1                \n\t"
        "paddb %%mm0, %%mm1                \n\t"
        "movq %%mm1, (%2, %0)                \n\t"
        "movq 8(%1, %0), %%mm0                \n\t"
        "movq 8(%2, %0), %%mm1                \n\t"
        "paddb %%mm0, %%mm1                \n\t"
        "movq %%mm1, 8(%2, %0)                \n\t"
        "addl $16, %0                        \n\t"
        "cmpl %3, %0                        \n\t"
        " jb 1b                                \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
477

    
478
/* dst[i] = src1[i] - src2[i] (byte-wise, wrapping) for i in [0, w).
   MMX loop does 16 bytes per iteration up to w-15; scalar tail loop
   handles the rest.
   NOTE(review): like add_bytes_mmx, no "memory" clobber is declared for
   the stores through dst — confirm the compiler cannot reorder around
   this asm. */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i=0;
    asm volatile(
        "1:                                \n\t"
        "movq  (%2, %0), %%mm0                \n\t"
        "movq  (%1, %0), %%mm1                \n\t"
        "psubb %%mm0, %%mm1                \n\t"
        "movq %%mm1, (%3, %0)                \n\t"
        "movq 8(%2, %0), %%mm0                \n\t"
        "movq 8(%1, %0), %%mm1                \n\t"
        "psubb %%mm0, %%mm1                \n\t"
        "movq %%mm1, 8(%3, %0)                \n\t"
        "addl $16, %0                        \n\t"
        "cmpl %4, %0                        \n\t"
        " jb 1b                                \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
499

    
500

    
501
#if 0
502
static void just_return() { return; }
503
#endif
504

    
505
/* Detect the CPU's SIMD capabilities and install the fastest available
   implementation for each DSPContext entry.  Plain-MMX versions are set
   first; MMX2 (preferred) or 3DNow! versions then override the subset
   they accelerate.  The 'mask' parameter is currently unused here. */
void dsputil_init_mmx(DSPContext* c, unsigned mask)
{
    mm_flags = mm_support();
#if 0
    /* Debug aid: print detected CPU flags. */
    fprintf(stderr, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        fprintf(stderr, " mmx");
    if (mm_flags & MM_MMXEXT)
        fprintf(stderr, " mmxext");
    if (mm_flags & MM_3DNOW)
        fprintf(stderr, " 3dnow");
    if (mm_flags & MM_SSE)
        fprintf(stderr, " sse");
    if (mm_flags & MM_SSE2)
        fprintf(stderr, " sse2");
    fprintf(stderr, "\n");
#endif

    if (mm_flags & MM_MMX) {
        /* Baseline MMX implementations. */
        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->put_pixels_clamped = put_pixels_clamped_mmx;
        c->add_pixels_clamped = add_pixels_clamped_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->pix_abs16x16     = pix_abs16x16_mmx;
        c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
        c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
        c->pix_abs8x8     = pix_abs8x8_mmx;
        c->pix_abs8x8_x2  = pix_abs8x8_x2_mmx;
        c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx;
        c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;

        /* tab[0] = 16-pixel-wide, tab[1] = 8-pixel-wide variants;
           indices 0..3 are full-pel, x-half, y-half, xy-half. */
        c->put_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;

        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;

        c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
        c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
        c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
        c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;

        c->put_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;

        c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;

        c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
        c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
        c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
        c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
        
        c->add_bytes= add_bytes_mmx;
        c->diff_bytes= diff_bytes_mmx;

        if (mm_flags & MM_MMXEXT) {
            /* MMX2 overrides: pavgb-based and enhanced SAD routines. */
            c->pix_abs16x16     = pix_abs16x16_mmx2;
            c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx2;
            c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx2;
            c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;

            c->pix_abs8x8     = pix_abs8x8_mmx2;
            c->pix_abs8x8_x2  = pix_abs8x8_x2_mmx2;
            c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx2;
            c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;

            c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
            c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;

            c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;

            c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
            c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;

            c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
        } else if (mm_flags & MM_3DNOW) {
            /* 3DNow! overrides (pavgusb-based). */
            c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
            c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;

            c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;

            c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
            c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
        }
    }

#if 0
    // for speed testing
    get_pixels = just_return;
    put_pixels_clamped = just_return;
    add_pixels_clamped = just_return;

    pix_abs16x16 = just_return;
    pix_abs16x16_x2 = just_return;
    pix_abs16x16_y2 = just_return;
    pix_abs16x16_xy2 = just_return;

    put_pixels_tab[0] = just_return;
    put_pixels_tab[1] = just_return;
    put_pixels_tab[2] = just_return;
    put_pixels_tab[3] = just_return;

    put_no_rnd_pixels_tab[0] = just_return;
    put_no_rnd_pixels_tab[1] = just_return;
    put_no_rnd_pixels_tab[2] = just_return;
    put_no_rnd_pixels_tab[3] = just_return;

    avg_pixels_tab[0] = just_return;
    avg_pixels_tab[1] = just_return;
    avg_pixels_tab[2] = just_return;
    avg_pixels_tab[3] = just_return;

    avg_no_rnd_pixels_tab[0] = just_return;
    avg_no_rnd_pixels_tab[1] = just_return;
    avg_no_rnd_pixels_tab[2] = just_return;
    avg_no_rnd_pixels_tab[3] = just_return;

    //av_fdct = just_return;
    //ff_idct = just_return;
#endif
}
671

    
672
/* remove any non bit exact operation (testing purpose). NOTE that
673
   this function should be kept as small as possible because it is
674
   always difficult to test automatically non bit exact cases. */
675
/* remove any non bit exact operation (testing purpose). NOTE that
   this function should be kept as small as possible because it is
   always difficult to test automatically non bit exact cases. */
/* Re-installs the plain-MMX variants over MMX2/3DNow! entries whose
   results are not bit-identical to the C reference (half-pel
   interpolation and averaging paths set by dsputil_init_mmx). */
void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
{
    if (mm_flags & MM_MMX) {
        /* MMX2 & 3DNOW */
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;

        if (mm_flags & MM_MMXEXT) {
            /* Revert MMX2 SAD routines for interpolated positions. */
            c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
            c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
            c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
            c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
            c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
            c->pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
        }
    }
}