Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / dsputil_mmx.c @ e7fce5e9

History | View | Annotate | Download (17.5 KB)

1
/*
2
 * MMX optimized DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 *
19
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20
 */
21

    
22
#include "../dsputil.h"
23
#include "../simple_idct.h"
24

    
25
int mm_flags; /* multimedia extension flags */
26

    
27
int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28
int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29
int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30
int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
31

    
32
int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33
int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34
int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35
int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
36

    
37
int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38
int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39
int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40
int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
41

    
42
int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43
int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44
int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45
int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
46

    
47
/* external functions, from idct_mmx.c */
48
void ff_mmx_idct(DCTELEM *block);
49
void ff_mmxext_idct(DCTELEM *block);
50

    
51
/* pixel operations */
52
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
53
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
54
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
55

    
56
#define JUMPALIGN() __asm __volatile (".balign 8"::)
57
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
58

    
59
#define MOVQ_WONE(regd) \
60
    __asm __volatile ( \
61
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
62
    "psrlw $15, %%" #regd ::)
63

    
64
#define MOVQ_BFE(regd) \
65
    __asm __volatile ( \
66
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
67
    "paddb %%" #regd ", %%" #regd " \n\t" ::)
68

    
69
#ifndef PIC
70
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
71
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
72
#else
73
// for shared libraries (PIC) it is better to build the constants in registers than to load them from memory
74
// pcmpeqd -> -1
75
#define MOVQ_BONE(regd) \
76
    __asm __volatile ( \
77
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
78
    "psrlw $15, %%" #regd " \n\t" \
79
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)
80

    
81
#define MOVQ_WTWO(regd) \
82
    __asm __volatile ( \
83
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
84
    "psrlw $15, %%" #regd " \n\t" \
85
    "psllw $1, %%" #regd " \n\t"::)
86

    
87
#endif
88

    
89
// using regr as temporary and for the output result
90
// first argument is unmodified and second is trashed
91
// regfe is supposed to contain 0xfefefefefefefefe
92
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
93
    "movq " #rega ", " #regr "        \n\t"\
94
    "pand " #regb ", " #regr "        \n\t"\
95
    "pxor " #rega ", " #regb "        \n\t"\
96
    "pand " #regfe "," #regb "        \n\t"\
97
    "psrlq $1, " #regb "         \n\t"\
98
    "paddb " #regb ", " #regr "        \n\t"
99

    
100
#define PAVGB_MMX(rega, regb, regr, regfe) \
101
    "movq " #rega ", " #regr "        \n\t"\
102
    "por  " #regb ", " #regr "        \n\t"\
103
    "pxor " #rega ", " #regb "        \n\t"\
104
    "pand " #regfe "," #regb "        \n\t"\
105
    "psrlq $1, " #regb "        \n\t"\
106
    "psubb " #regb ", " #regr "        \n\t"
107

    
108
// mm6 is supposed to contain 0xfefefefefefefefe
109
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
110
    "movq " #rega ", " #regr "        \n\t"\
111
    "movq " #regc ", " #regp "        \n\t"\
112
    "pand " #regb ", " #regr "        \n\t"\
113
    "pand " #regd ", " #regp "        \n\t"\
114
    "pxor " #rega ", " #regb "        \n\t"\
115
    "pxor " #regc ", " #regd "        \n\t"\
116
    "pand %%mm6, " #regb "        \n\t"\
117
    "pand %%mm6, " #regd "        \n\t"\
118
    "psrlq $1, " #regb "         \n\t"\
119
    "psrlq $1, " #regd "         \n\t"\
120
    "paddb " #regb ", " #regr "        \n\t"\
121
    "paddb " #regd ", " #regp "        \n\t"
122

    
123
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
124
    "movq " #rega ", " #regr "        \n\t"\
125
    "movq " #regc ", " #regp "        \n\t"\
126
    "por  " #regb ", " #regr "        \n\t"\
127
    "por  " #regd ", " #regp "        \n\t"\
128
    "pxor " #rega ", " #regb "        \n\t"\
129
    "pxor " #regc ", " #regd "        \n\t"\
130
    "pand %%mm6, " #regb "             \n\t"\
131
    "pand %%mm6, " #regd "             \n\t"\
132
    "psrlq $1, " #regd "        \n\t"\
133
    "psrlq $1, " #regb "        \n\t"\
134
    "psubb " #regb ", " #regr "        \n\t"\
135
    "psubb " #regd ", " #regp "        \n\t"
136

    
137
/***********************************/
138
/* MMX no rounding */
139
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
140
#define SET_RND  MOVQ_WONE
141
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
142
#define PAVGB(a, b, c, e)                PAVGB_MMX_NO_RND(a, b, c, e)
143

    
144
#include "dsputil_mmx_rnd.h"
145

    
146
#undef DEF
147
#undef SET_RND
148
#undef PAVGBP
149
#undef PAVGB
150
/***********************************/
151
/* MMX rounding */
152

    
153
#define DEF(x, y) x ## _ ## y ##_mmx
154
#define SET_RND  MOVQ_WTWO
155
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
156
#define PAVGB(a, b, c, e)                PAVGB_MMX(a, b, c, e)
157

    
158
#include "dsputil_mmx_rnd.h"
159

    
160
#undef DEF
161
#undef SET_RND
162
#undef PAVGBP
163
#undef PAVGB
164

    
165
/***********************************/
166
/* 3Dnow specific */
167

    
168
#define DEF(x) x ## _3dnow
169
/* for Athlons PAVGUSB is preferred */
170
#define PAVGB "pavgusb"
171

    
172
#include "dsputil_mmx_avg.h"
173

    
174
#undef DEF
175
#undef PAVGB
176

    
177
/***********************************/
178
/* MMX2 specific */
179

    
180
#define DEF(x) x ## _mmx2
181

    
182
/* Introduced only in MMX2 set */
183
#define PAVGB "pavgb"
184

    
185
#include "dsputil_mmx_avg.h"
186

    
187
#undef DEF
188
#undef PAVGB
189

    
190
/***********************************/
191
/* standard MMX */
192

    
193
/**
 * Widen 8 rows of 8 unsigned bytes into 16-bit words.
 *
 * Every loop iteration loads two source rows (at pixels and at
 * pixels + line_size), zero-extends each byte to a word by unpacking
 * against the zeroed mm7, and stores the resulting 32 bytes into block.
 * The counter in eax runs from -128 to 0 in steps of 32, so four
 * iterations write the 128 bytes that end at block+64.
 *
 * @param block     destination buffer; 128 bytes are written
 * @param pixels    first source row
 * @param line_size byte offset from one source row to the next
 */
static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
{
    __asm __volatile(
        "movl $-128, %%eax              \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%eax)        \n\t"
        "movq %%mm1, 8(%1, %%eax)       \n\t"
        "movq %%mm2, 16(%1, %%eax)      \n\t"
        "movq %%mm3, 24(%1, %%eax)      \n\t"
        "addl %3, %0                    \n\t"
        "addl $32, %%eax                \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        /* "memory": the stores go through an input-only pointer, so the
           compiler must be told that block's contents change here. */
        : "%eax", "memory"
    );
}
220

    
221
/**
 * Store the per-pixel difference s1 - s2 of two 8x8 byte blocks as
 * 16-bit words.
 *
 * Every loop iteration loads one 8-byte row from each source,
 * zero-extends both to words against the zeroed mm7, subtracts with
 * psubw and stores 16 bytes into block. The counter in eax runs from
 * -128 to 0 in steps of 16, so eight iterations write the 128 bytes
 * that end at block+64.
 *
 * @param block  destination buffer; 128 bytes are written
 * @param s1     first row of the minuend block
 * @param s2     first row of the subtrahend block
 * @param stride byte offset between rows, applied to both sources
 */
static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "movl $-128, %%eax              \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%eax)        \n\t"
        "movq %%mm1, 8(%2, %%eax)       \n\t"
        "addl %3, %0                    \n\t"
        "addl %3, %1                    \n\t"
        "addl $16, %%eax                \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        /* "memory": the stores go through an input-only pointer, so the
           compiler must be told that block's contents change here. */
        : "%eax", "memory"
    );
}
249

    
250
/**
 * Write an 8x8 block of 16-bit values as bytes, using packuswb to
 * saturate each value to the unsigned byte range.
 *
 * The work is split into two unrolled halves of four rows each: every
 * half reads 64 bytes of coefficients and writes rows 0, 1, 2 and 3
 * relative to its destination pointer.
 *
 * @param block     source coefficients; 128 bytes are read
 * @param pixels    first destination row
 * @param line_size byte offset from one destination row to the next
 */
static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    /* second half: four rows further down, 32 coefficients further on */
    UINT8 *lower_rows = pixels + line_size*4;
    const DCTELEM *lower_coeffs = block + 32;

    /* upper four rows; the coefficients are passed as a memory operand */
    __asm __volatile(
            "movq        %3, %%mm0\n\t"
            "movq        8%3, %%mm1\n\t"
            "movq        16%3, %%mm2\n\t"
            "movq        24%3, %%mm3\n\t"
            "movq        32%3, %%mm4\n\t"
            "movq        40%3, %%mm5\n\t"
            "movq        48%3, %%mm6\n\t"
            "movq        56%3, %%mm7\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            "packuswb %%mm5, %%mm4\n\t"
            "packuswb %%mm7, %%mm6\n\t"
            "movq        %%mm0, (%0)\n\t"
            "movq        %%mm2, (%0, %1)\n\t"
            "movq        %%mm4, (%0, %1, 2)\n\t"
            "movq        %%mm6, (%0, %2)\n\t"
            ::"r" (pixels), "r" (line_size), "r" (line_size*3), "m"(*block)
            :"memory");

    // an exact copy of the statement above made the compiler generate
    // some very strange code, so this half takes the coefficient
    // pointer in a register ("r") instead
    __asm __volatile(
            "movq        (%3), %%mm0\n\t"
            "movq        8(%3), %%mm1\n\t"
            "movq        16(%3), %%mm2\n\t"
            "movq        24(%3), %%mm3\n\t"
            "movq        32(%3), %%mm4\n\t"
            "movq        40(%3), %%mm5\n\t"
            "movq        48(%3), %%mm6\n\t"
            "movq        56(%3), %%mm7\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            "packuswb %%mm5, %%mm4\n\t"
            "packuswb %%mm7, %%mm6\n\t"
            "movq        %%mm0, (%0)\n\t"
            "movq        %%mm2, (%0, %1)\n\t"
            "movq        %%mm4, (%0, %1, 2)\n\t"
            "movq        %%mm6, (%0, %2)\n\t"
            ::"r" (lower_rows), "r" (line_size), "r" (line_size*3), "r"(lower_coeffs)
            :"memory");
}
304

    
305
/**
 * Add an 8x8 block of 16-bit values onto existing pixels in place.
 *
 * Each pass handles two destination rows: the pixel bytes are widened
 * to words against the zeroed mm7, added to the coefficients with the
 * saturating paddsw, and packed back to bytes with the saturating
 * packuswb. Four passes cover the whole block.
 *
 * @param block     values to add; 16 are consumed per pass
 * @param pixels    first destination row, updated in place
 * @param line_size byte offset from one destination row to the next
 */
static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *coeffs = block;
    UINT8 *row = pixels;
    int pass;

    /* mm7 stays zero across the asm statements below */
    MOVQ_ZERO(mm7);
    for (pass = 0; pass < 4; pass++) {
        __asm __volatile(
                "movq        (%2), %%mm0\n\t"
                "movq        8(%2), %%mm1\n\t"
                "movq        16(%2), %%mm2\n\t"
                "movq        24(%2), %%mm3\n\t"
                "movq        %0, %%mm4\n\t"
                "movq        %1, %%mm6\n\t"
                "movq        %%mm4, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm4\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "paddsw        %%mm4, %%mm0\n\t"
                "paddsw        %%mm5, %%mm1\n\t"
                "movq        %%mm6, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "paddsw        %%mm6, %%mm2\n\t"
                "paddsw        %%mm5, %%mm3\n\t"
                "packuswb %%mm1, %%mm0\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                "movq        %%mm0, %0\n\t"
                "movq        %%mm2, %1\n\t"
                :"+m"(*row), "+m"(*(row+line_size))
                :"r"(coeffs)
                :"memory");
        row += line_size*2;
        coeffs += 16;
    }
}
345

    
346
/**
 * Copy h rows of 8 bytes from pixels to block.
 *
 * Source and destination use the same row pitch (line_size). The loop
 * body is unrolled to copy four rows per iteration and the row counter
 * is decremented by 4 and tested with jnz, so h must be a positive
 * multiple of 4 for the loop to terminate.
 *
 * @param block     destination, first row
 * @param pixels    source, first row
 * @param line_size byte offset between consecutive rows (both buffers)
 * @param h         number of rows to copy
 */
static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
347
{
348
    __asm __volatile(
349
         /* eax = 2*line_size: the pointer step after each pair of rows */
         "lea (%3, %3), %%eax                \n\t"
350
         ".balign 8                        \n\t"
351
         "1:                                \n\t"
352
         "movq (%1), %%mm0                \n\t"
353
         "movq (%1, %3), %%mm1                \n\t"
354
              "movq %%mm0, (%2)                \n\t"
355
         "movq %%mm1, (%2, %3)                \n\t"
356
         "addl %%eax, %1                \n\t"
357
         "addl %%eax, %2                       \n\t"
358
         "movq (%1), %%mm0                \n\t"
359
         "movq (%1, %3), %%mm1                \n\t"
360
         "movq %%mm0, (%2)                \n\t"
361
         "movq %%mm1, (%2, %3)                \n\t"
362
         "addl %%eax, %1                \n\t"
363
         "addl %%eax, %2                       \n\t"
364
         /* four rows done; loop until the row counter reaches zero */
         "subl $4, %0                        \n\t"
365
         "jnz 1b                        \n\t"
366
         : "+g"(h), "+r" (pixels),  "+r" (block)
367
         : "r"(line_size)
368
         : "%eax", "memory"
369
        );
370
}
371

    
372
/**
 * Zero 128*6 = 768 bytes starting at blocks.
 *
 * The asm is handed a pointer to the end of the region and counts a
 * negative offset in eax up to zero, storing 32 zero bytes from mm7
 * per iteration.
 *
 * @param blocks start of the region to clear
 */
static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "movl $-128*6, %%eax            \n\t"
                "1:                             \n\t"
                "movq %%mm7, (%0, %%eax)        \n\t"
                "movq %%mm7, 8(%0, %%eax)       \n\t"
                "movq %%mm7, 16(%0, %%eax)      \n\t"
                "movq %%mm7, 24(%0, %%eax)      \n\t"
                "addl $32, %%eax                \n\t"
                " js 1b                         \n\t"
                /* byte-pointer arithmetic instead of casting the pointer to int */
                : : "r" (((char *)blocks)+128*6)
                /* "memory": the stores go through an input-only pointer, so the
                   compiler must be told that the blocks are overwritten here. */
                : "%eax", "memory"
        );
}
388

    
389
/* Disabled no-op stub; its only users are the assignments in the
   "#if 0 // for speed testing" section of dsputil_init_mmx(). */
#if 0
390
static void just_return() { return; }
391
#endif
392

    
393
void dsputil_init_mmx(void)
394
{
395
    mm_flags = mm_support();
396
#if 1
397
    printf("libavcodec: CPU flags:");
398
    if (mm_flags & MM_MMX)
399
        printf(" mmx");
400
    if (mm_flags & MM_MMXEXT)
401
        printf(" mmxext");
402
    if (mm_flags & MM_3DNOW)
403
        printf(" 3dnow");
404
    if (mm_flags & MM_SSE)
405
        printf(" sse");
406
    if (mm_flags & MM_SSE2)
407
        printf(" sse2");
408
    printf("\n");
409
#endif
410

    
411
    if (mm_flags & MM_MMX) {
412
        get_pixels = get_pixels_mmx;
413
        diff_pixels = diff_pixels_mmx;
414
        put_pixels_clamped = put_pixels_clamped_mmx;
415
        add_pixels_clamped = add_pixels_clamped_mmx;
416
        clear_blocks= clear_blocks_mmx;
417

    
418
        pix_abs16x16     = pix_abs16x16_mmx;
419
        pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
420
        pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
421
        pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
422
        pix_abs8x8    = pix_abs8x8_mmx;
423
        pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
424
        pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
425
        pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
426

    
427
        put_pixels_tab[0] = put_pixels_mmx;
428
        put_pixels_tab[1] = put_pixels_x2_mmx;
429
        put_pixels_tab[2] = put_pixels_y2_mmx;
430
        put_pixels_tab[3] = put_pixels_xy2_mmx;
431

    
432
        put_no_rnd_pixels_tab[0] = put_pixels_mmx;
433
        put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
434
        put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
435
        put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
436

    
437
        avg_pixels_tab[0] = avg_pixels_mmx;
438
        avg_pixels_tab[1] = avg_pixels_x2_mmx;
439
        avg_pixels_tab[2] = avg_pixels_y2_mmx;
440
        avg_pixels_tab[3] = avg_pixels_xy2_mmx;
441

    
442
        avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
443
        avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
444
        avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
445
        avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
446

    
447
        if (mm_flags & MM_MMXEXT) {
448
            pix_abs16x16    = pix_abs16x16_mmx2;
449
            pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
450
            pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
451
            pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
452

    
453
            pix_abs8x8    = pix_abs8x8_mmx2;
454
            pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
455
            pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
456
            pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
457

    
458
            put_pixels_tab[1] = put_pixels_x2_mmx2;
459
            put_pixels_tab[2] = put_pixels_y2_mmx2;
460
            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
461
            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
462

    
463
            avg_pixels_tab[0] = avg_pixels_mmx2;
464
            avg_pixels_tab[1] = avg_pixels_x2_mmx2;
465
            avg_pixels_tab[2] = avg_pixels_y2_mmx2;
466
            avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
467
        } else if (mm_flags & MM_3DNOW) {
468
            put_pixels_tab[1] = put_pixels_x2_3dnow;
469
            put_pixels_tab[2] = put_pixels_y2_3dnow;
470
            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
471
            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
472

    
473
            avg_pixels_tab[0] = avg_pixels_3dnow;
474
            avg_pixels_tab[1] = avg_pixels_x2_3dnow;
475
            avg_pixels_tab[2] = avg_pixels_y2_3dnow;
476
            avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
477
        }
478

    
479
        /* idct */
480
        if (mm_flags & MM_MMXEXT) {
481
            ff_idct = ff_mmxext_idct;
482
        } else {
483
            ff_idct = ff_mmx_idct;
484
        }
485
#ifdef SIMPLE_IDCT
486
//        ff_idct = simple_idct;
487
        ff_idct = simple_idct_mmx;
488
#endif
489
    }
490

    
491
#if 0
492
    // for speed testing
493
    get_pixels = just_return;
494
    put_pixels_clamped = just_return;
495
    add_pixels_clamped = just_return;
496

497
    pix_abs16x16 = just_return;
498
    pix_abs16x16_x2 = just_return;
499
    pix_abs16x16_y2 = just_return;
500
    pix_abs16x16_xy2 = just_return;
501

502
    put_pixels_tab[0] = just_return;
503
    put_pixels_tab[1] = just_return;
504
    put_pixels_tab[2] = just_return;
505
    put_pixels_tab[3] = just_return;
506

507
    put_no_rnd_pixels_tab[0] = just_return;
508
    put_no_rnd_pixels_tab[1] = just_return;
509
    put_no_rnd_pixels_tab[2] = just_return;
510
    put_no_rnd_pixels_tab[3] = just_return;
511

512
    avg_pixels_tab[0] = just_return;
513
    avg_pixels_tab[1] = just_return;
514
    avg_pixels_tab[2] = just_return;
515
    avg_pixels_tab[3] = just_return;
516

517
    avg_no_rnd_pixels_tab[0] = just_return;
518
    avg_no_rnd_pixels_tab[1] = just_return;
519
    avg_no_rnd_pixels_tab[2] = just_return;
520
    avg_no_rnd_pixels_tab[3] = just_return;
521

522
    //av_fdct = just_return;
523
    //ff_idct = just_return;
524
#endif
525
}
526

    
527
void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block);
528

    
529
/**
530
 * this will send coeff matrixes which would have different results for the 16383 type MMX vs C IDCTs to the C IDCT
531
 */ 
532
void bit_exact_idct_put(UINT8 *dest, int line_size, INT16 *block){
533
    if(   block[0]>1022 && block[1]==0 && block[4 ]==0 && block[5 ]==0
534
       && block[8]==0   && block[9]==0 && block[12]==0 && block[13]==0){
535
        int16_t tmp[64];
536
        int i;
537

    
538
        for(i=0; i<64; i++)
539
            tmp[i]= block[i];
540
        for(i=0; i<64; i++)
541
            block[i]= tmp[block_permute_op(i)];
542
        
543
        simple_idct_put(dest, line_size, block);
544
    }
545
    else
546
        gen_idct_put(dest, line_size, block);
547
}
548

    
549
/* remove any non bit exact operation (testing purpose). NOTE that
550
   this function should be kept as small as possible because it is
551
   always difficult to test automatically non bit exact cases. */
552
void dsputil_set_bit_exact_mmx(void)
553
{
554
    if (mm_flags & MM_MMX) {
555
        if (mm_flags & MM_MMXEXT) {
556
            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
557
            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
558
            avg_pixels_tab[3] = avg_pixels_xy2_mmx;
559

    
560
            pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
561
            pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
562
            pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
563
            pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
564
            pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
565
            pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
566
        } else if (mm_flags & MM_3DNOW) {
567
            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
568
            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
569
            avg_pixels_tab[3] = avg_pixels_xy2_mmx;
570
        }
571
#ifdef SIMPLE_IDCT
572
        if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx)
573
            ff_idct_put= bit_exact_idct_put;
574
#endif
575
    }
576
}