Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / dsputil_mmx.c @ ec9a41f5

History | View | Annotate | Download (28.4 KB)

1
/*
2
 * MMX optimized DSP utils
3
 * Copyright (c) 2000, 2001 Gerard Lantau.
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, write to the Free Software
17
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
 *
19
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20
 */
21

    
22
#include "../dsputil.h"
23
#include "../simple_idct.h"
24
#include "../mangle.h"
25

    
26
int mm_flags; /* multimedia extension flags */
27

    
28
int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29
int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30
int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
31
int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
32

    
33
int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34
int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35
int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
36
int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
37

    
38
int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39
int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40
int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
41
int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
42

    
43
int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44
int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45
int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
46
int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
47

    
48
/* external functions, from idct_mmx.c */
49
void ff_mmx_idct(DCTELEM *block);
50
void ff_mmxext_idct(DCTELEM *block);
51

    
52
/* pixel operations */
53
/*
 * 8-byte aligned constants loaded into MMX registers by the MOVQ_* macros.
 *
 * mm_bone is only ever named inside an asm template string (through
 * MANGLE(mm_bone)), which the compiler cannot see as a reference.  Without
 * the "used" attribute an optimizing compiler is free to drop the object and
 * the assembler reference then fails at link time.
 */
static const uint64_t mm_bone __attribute__ ((used, aligned(8))) = 0x0101010101010101ULL; /* 8 bytes of 1 */
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;       /* 4 words of 1 */
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;       /* 4 words of 2 */
58

    
59
/* Emit an 8-byte alignment directive at this point of the instruction stream. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)

/* Clear MMX register regd by xor-ing it with itself. */
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#ifndef PIC

/* Non-PIC build: fetch the constants from the static tables. */
#define MOVQ_WONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))

/*
 * Unlike the two macros above this one is not a statement: it expands to a
 * string fragment meant to be pasted into a larger asm template, and regd is
 * inserted without a register prefix.
 */
#define MOVQ_BONE(regd)  "movq "MANGLE(mm_bone)", "#regd" \n\t"

#else

/*
 * PIC (shared library) build: synthesize the constants in registers instead
 * of addressing memory.  pcmpeqd of a register with itself sets every bit
 * (-1); shifting each word right by 15 leaves 1 in every word.
 */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
       "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
       "psrlw $15, %%" #regd ::)

/* Same as MOVQ_WONE, then shift each word left by one to obtain 2. */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
       "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
       "psrlw $15, %%" #regd " \n\t" \
       "psllw $1, %%" #regd ::)

/* String-fragment form: words of 1 packed down to bytes of 1. */
#define MOVQ_BONE(regd) \
       "pcmpeqd " #regd ", " #regd " \n\t" \
       "psrlw $15, " #regd " \n\t"\
       "packuswb " #regd ", " #regd " \n\t"

#endif
85

    
86

    
87
/***********************************/
88
/* 3Dnow specific */
89

    
90
#define DEF(x) x ## _3dnow
91
/* for Athlons PAVGUSB is preferred */
92
#define PAVGB "pavgusb"
93

    
94
#include "dsputil_mmx_avg.h"
95

    
96
#undef DEF
97
#undef PAVGB
98

    
99
/***********************************/
100
/* MMX2 specific */
101

    
102
#define DEF(x) x ## _mmx2
103

    
104
/* Introduced only in MMX2 set */
105
#define PAVGB "pavgb"
106

    
107
#include "dsputil_mmx_avg.h"
108

    
109
#undef DEF
110
#undef PAVGB
111

    
112
/***********************************/
113
/* standard MMX */
114

    
115
/*
 * Expand an 8x8 block of 8-bit pixels into 64 zero-extended 16-bit words.
 *
 *   block     - destination; 128 bytes ending at block+64 are written
 *   pixels    - first source pixel
 *   line_size - byte distance between consecutive source rows
 *
 * Every iteration converts two source rows (32 output bytes).  %eax runs from
 * -128 up to 0 and is used as a negative byte offset from block+64, so the
 * loop ends as soon as the sign flag clears.
 *
 * NOTE(review): the addressing assumes sizeof(DCTELEM) == 2, so that block+64
 * lies 128 bytes past block - confirm against dsputil.h.
 *
 * The asm stores through a pointer that is only passed as an input register,
 * so "memory" must be listed as clobbered; otherwise the compiler may assume
 * the contents of block[] are unchanged across the statement.
 */
static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
{
    asm volatile(
        "movl $-128, %%eax              \n\t"
        "pxor %%mm7, %%mm7              \n\t" /* mm7 = 0, used for zero extension */
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t" /* row n   */
        "movq (%0, %2), %%mm2           \n\t" /* row n+1 */
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%eax)        \n\t"
        "movq %%mm1, 8(%1, %%eax)       \n\t"
        "movq %%mm2, 16(%1, %%eax)      \n\t"
        "movq %%mm3, 24(%1, %%eax)      \n\t"
        "addl %3, %0                    \n\t" /* advance source by two rows */
        "addl $32, %%eax                \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        : "%eax", "memory"
    );
}
142

    
143
/*
 * Store the per-pixel difference s1 - s2 of two 8x8 pixel blocks as 64
 * 16-bit words.
 *
 *   block  - destination; 128 bytes ending at block+64 are written
 *   s1, s2 - first pixel of the two source blocks
 *   stride - byte distance between consecutive rows, used for both sources
 *
 * One row (16 output bytes) is handled per iteration; %eax counts from -128
 * up to 0 and doubles as a negative byte offset from block+64.
 *
 * NOTE(review): the addressing assumes sizeof(DCTELEM) == 2 - confirm against
 * dsputil.h.
 *
 * "memory" is listed as clobbered because the asm writes block[] through an
 * input-only register operand.
 */
static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t" /* mm7 = 0, used for zero extension */
        "movl $-128, %%eax              \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t" /* s1 - s2, low four pixels  */
        "psubw %%mm3, %%mm1             \n\t" /* s1 - s2, high four pixels */
        "movq %%mm0, (%2, %%eax)        \n\t"
        "movq %%mm1, 8(%2, %%eax)       \n\t"
        "addl %3, %0                    \n\t"
        "addl %3, %1                    \n\t"
        "addl $16, %%eax                \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        : "%eax", "memory"
    );
}
171

    
172
/*
 * Convert 64 signed 16-bit coefficients to an 8x8 block of 8-bit pixels,
 * saturating every value to the unsigned byte range (packuswb).
 *
 *   block     - 64 source coefficients, read as 128 consecutive bytes
 *   pixels    - first destination pixel
 *   line_size - byte distance between consecutive destination rows
 *
 * The work is done in two passes of four rows each.  Both passes address the
 * coefficients through a register base with explicit displacements.  The
 * earlier form of the first pass glued the displacement in front of an "m"
 * operand ("8%3"), which only assembles to the intended address when the
 * compiler happens to print that operand as "(%reg)" with no displacement or
 * symbol of its own.
 *
 * NOTE(review): advancing by 32 elements per 64 bytes read assumes
 * sizeof(DCTELEM) == 2 - confirm against dsputil.h.
 */
static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    int half;

    for (half = 0; half < 2; half++) {
        const DCTELEM *p = block + half * 32;
        UINT8 *pix = pixels + half * line_size * 4;

        __asm __volatile(
                "movq   (%3), %%mm0\n\t"
                "movq   8(%3), %%mm1\n\t"
                "movq   16(%3), %%mm2\n\t"
                "movq   24(%3), %%mm3\n\t"
                "movq   32(%3), %%mm4\n\t"
                "movq   40(%3), %%mm5\n\t"
                "movq   48(%3), %%mm6\n\t"
                "movq   56(%3), %%mm7\n\t"
                "packuswb %%mm1, %%mm0\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                "packuswb %%mm5, %%mm4\n\t"
                "packuswb %%mm7, %%mm6\n\t"
                "movq   %%mm0, (%0)\n\t"        /* row 0 of this pass */
                "movq   %%mm2, (%0, %1)\n\t"    /* row 1 */
                "movq   %%mm4, (%0, %1, 2)\n\t" /* row 2 */
                "movq   %%mm6, (%0, %2)\n\t"    /* row 3 (offset line_size*3) */
                ::"r" (pix), "r" (line_size), "r" (line_size*3), "r" (p)
                :"memory");
    }
}
226

    
227
/*
 * Add 64 signed 16-bit coefficients to an 8x8 block of 8-bit pixels in
 * place.  The sums use signed saturating word adds and are then packed back
 * to bytes with unsigned saturation.
 *
 *   block     - source coefficients, 32 bytes consumed per pair of rows
 *   pixels    - first pixel of the block that is updated
 *   line_size - byte distance between consecutive pixel rows
 *
 * Two rows are processed per asm statement, four statements in total.  mm7
 * is zeroed once up front and has to stay zero between the statements.
 */
static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *coef = block;
    UINT8 *dst = pixels;
    int pairs_left;

    MOVQ_ZERO(mm7);
    for (pairs_left = 4; pairs_left != 0; pairs_left--) {
        __asm __volatile(
                /* coefficients for two rows */
                "movq   (%2), %%mm0\n\t"
                "movq   8(%2), %%mm1\n\t"
                "movq   16(%2), %%mm2\n\t"
                "movq   24(%2), %%mm3\n\t"
                /* the two pixel rows */
                "movq   %0, %%mm4\n\t"
                "movq   %1, %%mm6\n\t"
                /* first row: widen to words and add */
                "movq   %%mm4, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm4\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "paddsw %%mm4, %%mm0\n\t"
                "paddsw %%mm5, %%mm1\n\t"
                /* second row: widen to words and add */
                "movq   %%mm6, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "paddsw %%mm6, %%mm2\n\t"
                "paddsw %%mm5, %%mm3\n\t"
                /* clamp to bytes and write back */
                "packuswb %%mm1, %%mm0\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                "movq   %%mm0, %0\n\t"
                "movq   %%mm2, %1\n\t"
                :"+m"(*dst), "+m"(*(dst+line_size))
                :"r"(coef)
                :"memory");
        dst += line_size*2;
        coef += 16;
    }
}
267

    
268
/*
 * Copy an 8-byte wide column of h rows from pixels to block.
 *
 *   block     - destination
 *   pixels    - source
 *   line_size - byte distance between consecutive rows, used for both
 *               source and destination
 *   h         - number of rows; must be a positive multiple of 4
 *
 * The loop copies four rows per iteration and only terminates when the row
 * counter reaches exactly zero, hence the restriction on h.  It must not be
 * unrolled further: a previous variant unrolled by eight was disabled because
 * of the h == 4 case.
 */
static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    asm volatile(
        "xorl %%eax, %%eax              \n\t" /* eax = running row offset */
        "movl %3, %%esi                 \n\t" /* esi = rows left */
        "1:                             \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq %%mm0, (%0, %%eax)        \n\t"
        "addl %2, %%eax                 \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq %%mm0, (%0, %%eax)        \n\t"
        "addl %2, %%eax                 \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq %%mm0, (%0, %%eax)        \n\t"
        "addl %2, %%eax                 \n\t"
        "movq (%1, %%eax), %%mm0        \n\t"
        "movq %%mm0, (%0, %%eax)        \n\t"
        "addl %2, %%eax                 \n\t"
        "subl $4, %%esi                 \n\t"
        " jnz 1b                        \n\t"
    :: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
    : "%eax", "%esi", "memory"
    );
}
328

    
329
/*
 * Horizontal half-pel copy with rounding: every output byte is
 * (pixels[x] + pixels[x + 1] + 1) >> 1, eight bytes per row, h rows.
 *
 *   block     - destination
 *   pixels    - source; nine bytes per row are read
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 (zero) and mm4 (words of 1) are loaded once before the loop and must
 * survive between the per-row asm statements.
 *
 * The source is addressed through a register with explicit displacements.
 * Prefixing a displacement to an "m" operand ("1%1") is only correct when the
 * compiler prints that operand as a bare "(%reg)".
 */
static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  UINT8 *p;
  const UINT8 *pix;
  p = block;
  pix = pixels;
  MOVQ_ZERO(mm7);
  MOVQ_WONE(mm4);
  JUMPALIGN();
  do {
    __asm __volatile(
        "movq   (%1), %%mm0\n\t"
        "movq   1(%1), %%mm1\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "paddusw %%mm4, %%mm0\n\t"      /* rounding term */
        "paddusw %%mm4, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"=m"(*p)
        :"r"(pix)
        :"memory");
   pix += line_size; p += line_size;
  } while (--h);
}
362

    
363
/*
 * Vertical half-pel copy with rounding: every output byte is
 * (pixels[x] + pixels[x + line_size] + 1) >> 1, eight bytes per row, h rows.
 *
 *   block     - destination
 *   pixels    - source; row h is read as well
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 (zero) and mm4 (words of 1) are loaded once before the loop and must
 * survive between the per-row asm statements.
 */
static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  const UINT8 *src = pixels;
  UINT8 *dst = block;

  MOVQ_ZERO(mm7);
  MOVQ_WONE(mm4);
  JUMPALIGN();
  do {
    __asm __volatile(
        /* current row and the row below it */
        "movq   %1, %%mm0\n\t"
        "movq   %2, %%mm1\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        /* widen bytes to words */
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        /* sum, add the rounding term, halve */
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "paddusw %%mm4, %%mm0\n\t"
        "paddusw %%mm4, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        /* back to bytes and store */
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"=m"(*dst)
        :"m"(*src),
         "m"(*(src+line_size))
        :"memory");
    src += line_size;
    dst += line_size;
  } while (--h);
}
398

    
399
/*
 * Diagonal half-pel copy with rounding: every output byte is
 * (a + b + c + d + 2) >> 2 where a, b are pixels[x], pixels[x + 1] and c, d
 * are the same two positions one row below.  Eight bytes per row, h rows.
 *
 *   block     - destination
 *   pixels    - source; nine bytes per row are read, and row h is read too
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 (zero) and mm6 (words of 2) are loaded once before the loop and must
 * survive between the per-row asm statements.
 *
 * Both source rows are addressed through registers with explicit
 * displacements.  Prefixing a displacement to an "m" operand ("1%1") is only
 * correct when the compiler prints that operand as a bare "(%reg)".
 */
static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  UINT8 *p;
  const UINT8 *pix;
  p = block;
  pix = pixels;
  MOVQ_ZERO(mm7);
  MOVQ_WTWO(mm6);
  JUMPALIGN();
  do {
    __asm __volatile(
        "movq   (%1), %%mm0\n\t"
        "movq   (%2), %%mm1\n\t"
        "movq   1(%1), %%mm4\n\t"
        "movq   1(%2), %%mm5\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "movq   %%mm4, %%mm1\n\t"
        "movq   %%mm5, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm4\n\t"
        "punpcklbw %%mm7, %%mm5\n\t"
        "punpckhbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm5, %%mm4\n\t"
        "paddusw %%mm3, %%mm1\n\t"
        "paddusw %%mm6, %%mm4\n\t"      /* rounding term */
        "paddusw %%mm6, %%mm1\n\t"
        "paddusw %%mm4, %%mm0\n\t"
        "paddusw %%mm1, %%mm2\n\t"
        "psrlw  $2, %%mm0\n\t"
        "psrlw  $2, %%mm2\n\t"
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"=m"(*p)
        :"r"(pix),
         "r"(pix+line_size)
        :"memory");
   pix += line_size;
   p += line_size;
  } while(--h);
}
446

    
447
/*
 * Horizontal half-pel copy without rounding: every output byte is
 * (pixels[x] + pixels[x + 1]) >> 1, eight bytes per row, h rows.
 *
 *   block     - destination
 *   pixels    - source; nine bytes per row are read
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 is zeroed once before the loop and must stay zero between the per-row
 * asm statements.
 *
 * The source is addressed through a register with explicit displacements.
 * Prefixing a displacement to an "m" operand ("1%1") is only correct when the
 * compiler prints that operand as a bare "(%reg)".
 */
static void put_no_rnd_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  UINT8  *p;
  const UINT8 *pix;
  p = block;
  pix = pixels;
  MOVQ_ZERO(mm7);
  do {
    __asm __volatile(
        "movq   (%1), %%mm0\n\t"
        "movq   1(%1), %%mm1\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"=m"(*p)
        :"r"(pix)
        :"memory");
   pix += line_size;
   p +=   line_size;
  } while (--h);
}
477

    
478
/*
 * Vertical half-pel copy without rounding: every output byte is
 * (pixels[x] + pixels[x + line_size]) >> 1, eight bytes per row, h rows.
 *
 *   block     - destination
 *   pixels    - source; row h is read as well
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 is zeroed once before the loop and must stay zero between the per-row
 * asm statements.
 */
static void put_no_rnd_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  const UINT8 *src = pixels;
  UINT8 *dst = block;

  MOVQ_ZERO(mm7);
  JUMPALIGN();
  do {
    __asm __volatile(
        /* current row and the row below it */
        "movq   %1, %%mm0\n\t"
        "movq   %2, %%mm1\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        /* widen bytes to words */
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        /* sum and halve, truncating */
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        /* back to bytes and store */
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"=m"(*dst)
        :"m"(*src),
         "m"(*(src+line_size))
        :"memory");
    src += line_size;
    dst += line_size;
  } while(--h);
}
510

    
511
/*
 * Diagonal half-pel copy with the reduced rounding term: every output byte is
 * (a + b + c + d + 1) >> 2 where a, b are pixels[x], pixels[x + 1] and c, d
 * are the same two positions one row below.  Eight bytes per row, h rows.
 *
 *   block     - destination
 *   pixels    - source; nine bytes per row are read, and row h is read too
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 (zero) and mm6 (words of 1) are loaded once before the loop and must
 * survive between the per-row asm statements.
 *
 * Both source rows are addressed through registers with explicit
 * displacements.  Prefixing a displacement to an "m" operand ("1%1") is only
 * correct when the compiler prints that operand as a bare "(%reg)".
 */
static void put_no_rnd_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  UINT8  *p;
  const UINT8 *pix;
  p = block;
  pix = pixels;
  MOVQ_ZERO(mm7);
  MOVQ_WONE(mm6);
  JUMPALIGN();
  do {
    __asm __volatile(
        "movq   (%1), %%mm0\n\t"
        "movq   (%2), %%mm1\n\t"
        "movq   1(%1), %%mm4\n\t"
        "movq   1(%2), %%mm5\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "movq   %%mm4, %%mm1\n\t"
        "movq   %%mm5, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm4\n\t"
        "punpcklbw %%mm7, %%mm5\n\t"
        "punpckhbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm5, %%mm4\n\t"
        "paddusw %%mm3, %%mm1\n\t"
        "paddusw %%mm6, %%mm4\n\t"      /* +1 instead of the usual +2 */
        "paddusw %%mm6, %%mm1\n\t"
        "paddusw %%mm4, %%mm0\n\t"
        "paddusw %%mm1, %%mm2\n\t"
        "psrlw  $2, %%mm0\n\t"
        "psrlw  $2, %%mm2\n\t"
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"=m"(*p)
        :"r"(pix),
         "r"(pix+line_size)
        :"memory");
   pix += line_size;
   p +=   line_size;
  } while(--h);
}
558

    
559
/*
 * Average the source into the destination with rounding: every destination
 * byte becomes (block[x] + pixels[x] + 1) >> 1, eight bytes per row, h rows.
 *
 *   block     - destination, read and rewritten
 *   pixels    - source
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 (zero) and mm6 (words of 1) are loaded once before the loop and must
 * survive between the per-row asm statements.
 */
static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  const UINT8 *src = pixels;
  UINT8 *dst = block;

  MOVQ_ZERO(mm7);
  MOVQ_WONE(mm6);
  JUMPALIGN();
  do {
    __asm __volatile(
        /* destination row and source row */
        "movq   %0, %%mm0\n\t"
        "movq   %1, %%mm1\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        /* widen bytes to words */
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        /* sum, add the rounding term, halve */
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "paddusw %%mm6, %%mm0\n\t"
        "paddusw %%mm6, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        /* back to bytes and store */
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"+m"(*dst)
        :"m"(*src)
        :"memory");
    src += line_size;
    dst += line_size;
  } while (--h);
}
594

    
595
/*
 * Average a horizontally interpolated source into the destination, rounding
 * at both steps.  With t = (pixels[x] + pixels[x + 1] + 1) >> 1, every
 * destination byte becomes (block[x] + t + 1) >> 1.  Eight bytes per row,
 * h rows.
 *
 *   block     - destination, read and rewritten
 *   pixels    - source; nine bytes per row are read
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 (zero) and mm6 (words of 1) are loaded once before the loop and must
 * survive between the per-row asm statements.
 *
 * The source is addressed through a register with explicit displacements.
 * Prefixing a displacement to an "m" operand ("1%1") is only correct when the
 * compiler prints that operand as a bare "(%reg)".
 */
static void avg_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  UINT8  *p;
  const UINT8 *pix;
  p = block;
  pix = pixels;
  MOVQ_ZERO(mm7);
  MOVQ_WONE(mm6);
  JUMPALIGN();
  do {
    __asm __volatile(
        "movq   (%1), %%mm1\n\t"
        "movq   %0, %%mm0\n\t"
        "movq   1(%1), %%mm4\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "movq   %%mm4, %%mm5\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm4\n\t"
        "punpckhbw %%mm7, %%mm5\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "paddusw %%mm4, %%mm1\n\t"
        "paddusw %%mm5, %%mm3\n\t"
        "paddusw %%mm6, %%mm1\n\t"
        "paddusw %%mm6, %%mm3\n\t"
        "psrlw  $1, %%mm1\n\t"          /* mm1/mm3 = interpolated source */
        "psrlw  $1, %%mm3\n\t"
        "paddusw %%mm6, %%mm0\n\t"
        "paddusw %%mm6, %%mm2\n\t"
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"+m"(*p)
        :"r"(pix)
        :"memory");
   pix += line_size;
   p +=   line_size;
  } while (--h);
}
639

    
640
/*
 * Average a vertically interpolated source into the destination, rounding at
 * both steps.  With t = (pixels[x] + pixels[x + line_size] + 1) >> 1, every
 * destination byte becomes (block[x] + t + 1) >> 1.  Eight bytes per row,
 * h rows.
 *
 *   block     - destination, read and rewritten
 *   pixels    - source; row h is read as well
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 (zero) and mm6 (words of 1) are loaded once before the loop and must
 * survive between the per-row asm statements.
 */
static void avg_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  const UINT8 *src = pixels;
  UINT8 *dst = block;

  MOVQ_ZERO(mm7);
  MOVQ_WONE(mm6);
  JUMPALIGN();
  do {
    __asm __volatile(
        /* source row, destination row, source row below */
        "movq   %1, %%mm1\n\t"
        "movq   %0, %%mm0\n\t"
        "movq   %2, %%mm4\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "movq   %%mm4, %%mm5\n\t"
        /* widen bytes to words */
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm4\n\t"
        "punpckhbw %%mm7, %%mm5\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        /* interpolate the two source rows, with rounding */
        "paddusw %%mm4, %%mm1\n\t"
        "paddusw %%mm5, %%mm3\n\t"
        "paddusw %%mm6, %%mm1\n\t"
        "paddusw %%mm6, %%mm3\n\t"
        "psrlw  $1, %%mm1\n\t"
        "psrlw  $1, %%mm3\n\t"
        /* average with the destination, with rounding */
        "paddusw %%mm6, %%mm0\n\t"
        "paddusw %%mm6, %%mm2\n\t"
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        /* back to bytes and store */
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"+m"(*dst)
        :"m"(*src), "m"(*(src+line_size))
        :"memory");
    src += line_size;
    dst += line_size;
  } while(--h);
}
684

    
685
/*
 * Average a diagonally interpolated source into the destination, rounding at
 * both steps.  With t = (a + b + c + d + 2) >> 2, where a, b are pixels[x],
 * pixels[x + 1] and c, d the same positions one row below, every destination
 * byte becomes (block[x] + t + 1) >> 1.  Eight bytes per row, h rows.
 *
 *   block     - destination, read and rewritten
 *   pixels    - source; nine bytes per row are read, and row h is read too
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 (zero) and mm6 (words of 2) are loaded once before the loop and must
 * survive between the per-row asm statements.
 *
 * Both source rows are addressed through registers with explicit
 * displacements.  Prefixing a displacement to an "m" operand ("1%1") is only
 * correct when the compiler prints that operand as a bare "(%reg)".
 */
static void avg_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  UINT8  *p;
  const UINT8 *pix;
  p = block;
  pix = pixels;
  MOVQ_ZERO(mm7);
  // this doesn't seem to be used often - so
  // the reload of mm_wone inside the loop is not optimized
  MOVQ_WTWO(mm6);
  do {
    __asm __volatile(
        "movq   (%1), %%mm0\n\t"
        "movq   (%2), %%mm1\n\t"
        "movq   1(%1), %%mm4\n\t"
        "movq   1(%2), %%mm5\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "movq   %%mm4, %%mm1\n\t"
        "movq   %%mm5, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm4\n\t"
        "punpcklbw %%mm7, %%mm5\n\t"
        "punpckhbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm5, %%mm4\n\t"
        "paddusw %%mm3, %%mm1\n\t"
        "paddusw %%mm6, %%mm4\n\t"      /* +2 for the four-sample average */
        "paddusw %%mm6, %%mm1\n\t"
        "paddusw %%mm4, %%mm0\n\t"
        "paddusw %%mm1, %%mm2\n\t"
        "movq   %3, %%mm5\n\t"          /* words of 1 for the final average */
        "psrlw  $2, %%mm0\n\t"
        "movq   %0, %%mm1\n\t"
        "psrlw  $2, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "paddusw %%mm5, %%mm0\n\t"
        "paddusw %%mm5, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"+m"(*p)
        :"r"(pix),
         "r"(pix+line_size), "m"(mm_wone)
        :"memory");
   pix += line_size;
   p +=   line_size ;
  } while(--h);
}
744

    
745
/*
 * Average the source into the destination without rounding: every
 * destination byte becomes (block[x] + pixels[x]) >> 1, eight bytes per row,
 * h rows.
 *
 *   block     - destination, read and rewritten
 *   pixels    - source
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 is zeroed once before the loop and must stay zero between the per-row
 * asm statements.
 */
static void avg_no_rnd_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  const UINT8 *src = pixels;
  UINT8 *dst = block;

  MOVQ_ZERO(mm7);
  do {
    __asm __volatile(
        /* source row and destination row */
        "movq   %1, %%mm0\n\t"
        "movq   %0, %%mm1\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        /* widen bytes to words */
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        /* sum and halve, truncating */
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        /* back to bytes and store */
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"+m"(*dst)
        :"m"(*src)
        :"memory");
    src += line_size;
    dst += line_size;
  } while (--h);
}
775

    
776
/*
 * Average a horizontally interpolated source into the destination without
 * rounding.  With t = (pixels[x] + pixels[x + 1]) >> 1, every destination
 * byte becomes (block[x] + t) >> 1.  Eight bytes per row, h rows.
 *
 *   block     - destination, read and rewritten
 *   pixels    - source; nine bytes per row are read
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 is zeroed once before the loop and must stay zero between the per-row
 * asm statements.
 *
 * The source is addressed through a register with explicit displacements.
 * Prefixing a displacement to an "m" operand ("1%1") is only correct when the
 * compiler prints that operand as a bare "(%reg)".
 */
static void avg_no_rnd_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  UINT8  *p;
  const UINT8 *pix;
  p = block;
  pix = pixels;
  MOVQ_ZERO(mm7);
  do {
    __asm __volatile(
        "movq   (%1), %%mm0\n\t"
        "movq   1(%1), %%mm1\n\t"
        "movq   %0, %%mm4\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "movq   %%mm4, %%mm5\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm4\n\t"
        "punpckhbw %%mm7, %%mm5\n\t"
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"          /* mm0/mm2 = interpolated source */
        "psrlw  $1, %%mm2\n\t"
        "paddusw %%mm4, %%mm0\n\t"
        "paddusw %%mm5, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"+m"(*p)
        :"r"(pix)
        :"memory");
   pix += line_size;
   p +=   line_size;
 } while (--h);
}
814

    
815
/*
 * Average a vertically interpolated source into the destination without
 * rounding.  With t = (pixels[x] + pixels[x + line_size]) >> 1, every
 * destination byte becomes (block[x] + t) >> 1.  Eight bytes per row, h rows.
 *
 *   block     - destination, read and rewritten
 *   pixels    - source; row h is read as well
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 is zeroed once before the loop and must stay zero between the per-row
 * asm statements.
 */
static void avg_no_rnd_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  const UINT8 *src = pixels;
  UINT8 *dst = block;

  MOVQ_ZERO(mm7);
  do {
    __asm __volatile(
        /* source row, source row below, destination row */
        "movq   %1, %%mm0\n\t"
        "movq   %2, %%mm1\n\t"
        "movq   %0, %%mm4\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "movq   %%mm4, %%mm5\n\t"
        /* widen bytes to words */
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm4\n\t"
        "punpckhbw %%mm7, %%mm5\n\t"
        /* interpolate the two source rows, truncating */
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        /* average with the destination, truncating */
        "paddusw %%mm4, %%mm0\n\t"
        "paddusw %%mm5, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        /* back to bytes and store */
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"+m"(*dst)
        :"m"(*src), "m"(*(src+line_size))
        :"memory");
    src += line_size;
    dst += line_size;
  } while(--h);
}
853

    
854
/*
 * Average a diagonally interpolated source into the destination.  With
 * t = (a + b + c + d + 1) >> 2, where a, b are pixels[x], pixels[x + 1] and
 * c, d the same positions one row below, every destination byte becomes
 * (block[x] + t) >> 1.  Eight bytes per row, h rows.
 *
 *   block     - destination, read and rewritten
 *   pixels    - source; nine bytes per row are read, and row h is read too
 *   line_size - byte distance between rows, used for source and destination
 *   h         - number of rows, at least 1 (the loop is do/while)
 *
 * mm7 (zero) and mm6 (words of 1) are loaded once before the loop and must
 * survive between the per-row asm statements.
 *
 * Both source rows are addressed through registers with explicit
 * displacements.  Prefixing a displacement to an "m" operand ("1%1") is only
 * correct when the compiler prints that operand as a bare "(%reg)".
 */
static void avg_no_rnd_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
  UINT8  *p;
  const UINT8 *pix;
  p = block;
  pix = pixels;
  MOVQ_ZERO(mm7);
  MOVQ_WONE(mm6);
  JUMPALIGN();
  do {
    __asm __volatile(
        "movq   (%1), %%mm0\n\t"
        "movq   (%2), %%mm1\n\t"
        "movq   1(%1), %%mm4\n\t"
        "movq   1(%2), %%mm5\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm2\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "movq   %%mm4, %%mm1\n\t"
        "movq   %%mm5, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm4\n\t"
        "punpcklbw %%mm7, %%mm5\n\t"
        "punpckhbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm5, %%mm4\n\t"
        "paddusw %%mm3, %%mm1\n\t"
        "paddusw %%mm6, %%mm4\n\t"      /* +1 instead of the usual +2 */
        "paddusw %%mm6, %%mm1\n\t"
        "paddusw %%mm4, %%mm0\n\t"
        "paddusw %%mm1, %%mm2\n\t"
        "movq   %0, %%mm1\n\t"
        "psrlw  $2, %%mm0\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "psrlw  $2, %%mm2\n\t"
        "punpcklbw %%mm7, %%mm1\n\t"
        "punpckhbw %%mm7, %%mm3\n\t"
        "paddusw %%mm1, %%mm0\n\t"
        "paddusw %%mm3, %%mm2\n\t"
        "psrlw  $1, %%mm0\n\t"
        "psrlw  $1, %%mm2\n\t"
        "packuswb  %%mm2, %%mm0\n\t"
        "movq   %%mm0, %0\n\t"
        :"+m"(*p)
        :"r"(pix),
         "r"(pix+line_size)
        :"memory");
   pix += line_size;
   p += line_size;
  } while(--h);
}
909

    
910
/*
 * Zero the 128*6 (= 768) bytes starting at `blocks` with MMX stores.
 *
 * The asm receives the address one past the end of that region and walks a
 * negative byte offset in %eax from -768 up to 0, writing four 8-byte zero
 * quadwords (32 bytes) per iteration; `js` keeps looping while the offset
 * is still negative after the add.
 *
 * blocks - start of the region to clear. Presumably six 8x8 blocks of
 *          16-bit coefficients (6 * 64 * 2 = 768 bytes) -- confirm against
 *          the definition of DCTELEM.
 *
 * Notes:
 *  - The (int) cast of the pointer is only valid where pointers are 32 bits
 *    wide; this file lives under i386/ and the asm uses 32-bit addressing.
 *  - mm7 is overwritten by the asm.
 *  - The "memory" clobber tells the compiler that the asm stores through the
 *    pointer: the region is only passed as a register input, so without the
 *    clobber the compiler may keep stale values of the region cached or move
 *    other accesses to it across the asm.
 *  - __asm/__volatile are used, as elsewhere in this file, so the statement
 *    also compiles when the plain `asm` keyword is disabled (-ansi, -std=c*).
 */
static void clear_blocks_mmx(DCTELEM *blocks)
{
        __asm __volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "movl $-128*6, %%eax                \n\t"
                "1:                                \n\t"
                "movq %%mm7, (%0, %%eax)        \n\t"
                "movq %%mm7, 8(%0, %%eax)        \n\t"
                "movq %%mm7, 16(%0, %%eax)        \n\t"
                "movq %%mm7, 24(%0, %%eax)        \n\t"
                "addl $32, %%eax                \n\t"
                " js 1b                                \n\t"
                : : "r" (((int)blocks)+128*6)
                : "%eax", "memory"
        );
}
926

    
927
#if 0
928
/* No-op stub; the disabled speed-testing code in dsputil_init_mmx() installs
   it in place of the real DSP routines. */
static void just_return()
{
}
929
#endif
930

    
931
#ifndef TESTCPU_MAIN
932
void dsputil_init_mmx(void)
933
{
934
    mm_flags = mm_support();
935
#if 1
936
    printf("libavcodec: CPU flags:");
937
    if (mm_flags & MM_MMX)
938
        printf(" mmx");
939
    if (mm_flags & MM_MMXEXT)
940
        printf(" mmxext");
941
    if (mm_flags & MM_3DNOW)
942
        printf(" 3dnow");
943
    if (mm_flags & MM_SSE)
944
        printf(" sse");
945
    if (mm_flags & MM_SSE2)
946
        printf(" sse2");
947
    printf("\n");
948
#endif
949

    
950
    if (mm_flags & MM_MMX) {
951
        get_pixels = get_pixels_mmx;
952
        diff_pixels = diff_pixels_mmx;
953
        put_pixels_clamped = put_pixels_clamped_mmx;
954
        add_pixels_clamped = add_pixels_clamped_mmx;
955
        clear_blocks= clear_blocks_mmx;
956

    
957
        pix_abs16x16     = pix_abs16x16_mmx;
958
        pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
959
        pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
960
        pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
961
        pix_abs8x8    = pix_abs8x8_mmx;
962
        pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
963
        pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
964
        pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
965
        av_fdct = fdct_mmx;
966

    
967
        put_pixels_tab[0] = put_pixels_mmx;
968
        put_pixels_tab[1] = put_pixels_x2_mmx;
969
        put_pixels_tab[2] = put_pixels_y2_mmx;
970
        put_pixels_tab[3] = put_pixels_xy2_mmx;
971

    
972
        put_no_rnd_pixels_tab[0] = put_pixels_mmx;
973
        put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
974
        put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
975
        put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
976

    
977
        avg_pixels_tab[0] = avg_pixels_mmx;
978
        avg_pixels_tab[1] = avg_pixels_x2_mmx;
979
        avg_pixels_tab[2] = avg_pixels_y2_mmx;
980
        avg_pixels_tab[3] = avg_pixels_xy2_mmx;
981

    
982
        avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
983
        avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
984
        avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
985
        avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
986

    
987
        if (mm_flags & MM_MMXEXT) {
988
            pix_abs16x16    = pix_abs16x16_mmx2;
989
            pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
990
            pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
991
            pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
992

    
993
            pix_abs8x8    = pix_abs8x8_mmx2;
994
            pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
995
            pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
996
            pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
997

    
998
            put_pixels_tab[1] = put_pixels_x2_mmx2;
999
            put_pixels_tab[2] = put_pixels_y2_mmx2;
1000
            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
1001
            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
1002

    
1003
            avg_pixels_tab[0] = avg_pixels_mmx2;
1004
            avg_pixels_tab[1] = avg_pixels_x2_mmx2;
1005
            avg_pixels_tab[2] = avg_pixels_y2_mmx2;
1006
            avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
1007
        } else if (mm_flags & MM_3DNOW) {
1008
            put_pixels_tab[1] = put_pixels_x2_3dnow;
1009
            put_pixels_tab[2] = put_pixels_y2_3dnow;
1010
            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
1011
            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
1012

    
1013
            avg_pixels_tab[0] = avg_pixels_3dnow;
1014
            avg_pixels_tab[1] = avg_pixels_x2_3dnow;
1015
            avg_pixels_tab[2] = avg_pixels_y2_3dnow;
1016
            avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
1017
        }
1018

    
1019
        /* idct */
1020
        if (mm_flags & MM_MMXEXT) {
1021
            ff_idct = ff_mmxext_idct;
1022
        } else {
1023
            ff_idct = ff_mmx_idct;
1024
        }
1025
#ifdef SIMPLE_IDCT
1026
//        ff_idct = simple_idct;
1027
        ff_idct = simple_idct_mmx;
1028
#endif
1029
    }
1030

    
1031
#if 0
1032
    // for speed testing
1033
    get_pixels = just_return;
1034
    put_pixels_clamped = just_return;
1035
    add_pixels_clamped = just_return;
1036

1037
    pix_abs16x16 = just_return;
1038
    pix_abs16x16_x2 = just_return;
1039
    pix_abs16x16_y2 = just_return;
1040
    pix_abs16x16_xy2 = just_return;
1041

1042
    put_pixels_tab[0] = just_return;
1043
    put_pixels_tab[1] = just_return;
1044
    put_pixels_tab[2] = just_return;
1045
    put_pixels_tab[3] = just_return;
1046

1047
    put_no_rnd_pixels_tab[0] = just_return;
1048
    put_no_rnd_pixels_tab[1] = just_return;
1049
    put_no_rnd_pixels_tab[2] = just_return;
1050
    put_no_rnd_pixels_tab[3] = just_return;
1051

1052
    avg_pixels_tab[0] = just_return;
1053
    avg_pixels_tab[1] = just_return;
1054
    avg_pixels_tab[2] = just_return;
1055
    avg_pixels_tab[3] = just_return;
1056

1057
    avg_no_rnd_pixels_tab[0] = just_return;
1058
    avg_no_rnd_pixels_tab[1] = just_return;
1059
    avg_no_rnd_pixels_tab[2] = just_return;
1060
    avg_no_rnd_pixels_tab[3] = just_return;
1061

1062
    //av_fdct = just_return;
1063
    //ff_idct = just_return;
1064
#endif
1065
}
1066

    
1067
/* remove any non bit exact operation (testing purpose). NOTE that
1068
   this function should be kept as small as possible because it is
1069
   always difficult to test automatically non bit exact cases. */
1070
void dsputil_set_bit_exact_mmx(void)
1071
{
1072
    if (mm_flags & MM_MMX) {
1073
        if (mm_flags & MM_MMXEXT) {
1074
            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1075
            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1076
            avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1077
        } else if (mm_flags & MM_3DNOW) {
1078
            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1079
            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1080
            avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1081
        }
1082
    }
1083
}
1084

    
1085
#endif