Revision a6493a8f

MAINTAINERS
 Alpha                                   Mans Rullgard, Falk Hueffner
 ARM                                     Mans Rullgard
 BeOS                                    Francois Revol
-i386                                    Michael Niedermayer
 Mac OS X / PowerPC                      Romain Dolbeau, Guillaume Poirier
 Amiga / PowerPC                         Colin Ward
 Linux / PowerPC                         Luca Barbato
......
 Windows Cygwin                          Victor Paesa
 ADI/Blackfin DSP                        Marc Hoffman
 Sparc                                   Roman Shaposhnik
+x86                                     Michael Niedermayer
README
   License, see the file COPYING.GPL for details. Their compilation and use
   in FFmpeg is optional.

-* The file libavcodec/i386/idct_mmx.c is distributed under the GNU General
+* The file libavcodec/x86/idct_mmx.c is distributed under the GNU General
   Public License. It is strictly an optimization and its use is optional.

 * The file libavcodec/ac3dec.c is distributed under the GNU General Public
common.mak
 $(SUBDIR)%-test.o: $(SUBDIR)%-test.c
 	$(CC) $(CFLAGS) -DTEST -c -o $$@ $$^

-$(SUBDIR)i386/%.o: $(SUBDIR)i386/%.asm
+$(SUBDIR)x86/%.o: $(SUBDIR)x86/%.asm
 	$(YASM) $(YASMFLAGS) -I $$(<D)/ -o $$@ $$<

-$(SUBDIR)i386/%.d: $(SUBDIR)i386/%.asm
+$(SUBDIR)x86/%.d: $(SUBDIR)x86/%.asm
 	$(YASM) $(YASMFLAGS) -I $$(<D)/ -M -o $$(@:%.d=%.o) $$< > $$@

 clean::
configure
         libavcodec/alpha  \
         libavcodec/arm    \
         libavcodec/bfin   \
-        libavcodec/i386   \
         libavcodec/mlib   \
         libavcodec/ppc    \
         libavcodec/sh4    \
         libavcodec/sparc  \
+        libavcodec/x86    \
         libavdevice       \
         libavfilter       \
         libavformat       \
doc/optimization.txt
 What to optimize:
 -----------------
 If you plan to do non-x86 architecture specific optimizations (SIMD normally),
-then take a look in the i386/ directory, as most important functions are
+then take a look in the x86/ directory, as most important functions are
 already optimized for MMX.

 If you want to do x86 optimizations then you can either try to finetune the
-stuff in the i386 directory or find some other functions in the C source to
+stuff in the x86 directory or find some other functions in the C source to
 optimize, but there aren't many left.


......
 revisions of the interesting files (for a web frontend try ViewVC at
 http://svn.mplayerhq.hu/ffmpeg/trunk/).
 Alternatively, look into the other architecture-specific versions in
-the i386/, ppc/, alpha/ subdirectories. Even if you don't exactly
+the x86/, ppc/, alpha/ subdirectories. Even if you don't exactly
 comprehend the instructions, it could help understanding the functions
 and how they can be optimized.

libavcodec/Makefile
 OBJS-$(HAVE_W32THREADS)                += w32thread.o

 # processor-specific code
-MMX-OBJS-$(CONFIG_CAVS_DECODER)        += i386/cavsdsp_mmx.o
-MMX-OBJS-$(CONFIG_ENCODERS)            += i386/dsputilenc_mmx.o
-MMX-OBJS-$(CONFIG_FLAC_ENCODER)        += i386/flacdsp_mmx.o
-MMX-OBJS-$(CONFIG_GPL)                 += i386/idct_mmx.o
-MMX-OBJS-$(CONFIG_SNOW_DECODER)        += i386/snowdsp_mmx.o
-MMX-OBJS-$(CONFIG_THEORA_DECODER)      += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_VC1_DECODER)         += i386/vc1dsp_mmx.o
-MMX-OBJS-$(CONFIG_VP3_DECODER)         += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_VP5_DECODER)         += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_VP6_DECODER)         += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_VP6A_DECODER)        += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_VP6F_DECODER)        += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_WMV3_DECODER)        += i386/vc1dsp_mmx.o
-MMX-OBJS-$(HAVE_YASM)                  += i386/dsputil_yasm.o \
-                                          i386/h264_deblock_sse2.o
-
-OBJS-$(HAVE_MMX)                       += i386/cpuid.o                  \
-                                          i386/dnxhd_mmx.o              \
-                                          i386/dsputil_mmx.o            \
-                                          i386/fdct_mmx.o               \
-                                          i386/idct_mmx_xvid.o          \
-                                          i386/idct_sse2_xvid.o         \
-                                          i386/motion_est_mmx.o         \
-                                          i386/mpegvideo_mmx.o          \
-                                          i386/simple_idct_mmx.o        \
+MMX-OBJS-$(CONFIG_CAVS_DECODER)        += x86/cavsdsp_mmx.o
+MMX-OBJS-$(CONFIG_ENCODERS)            += x86/dsputilenc_mmx.o
+MMX-OBJS-$(CONFIG_FLAC_ENCODER)        += x86/flacdsp_mmx.o
+MMX-OBJS-$(CONFIG_GPL)                 += x86/idct_mmx.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp_mmx.o
+MMX-OBJS-$(CONFIG_THEORA_DECODER)      += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VC1_DECODER)         += x86/vc1dsp_mmx.o
+MMX-OBJS-$(CONFIG_VP3_DECODER)         += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP5_DECODER)         += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP6_DECODER)         += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP6A_DECODER)        += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP6F_DECODER)        += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_WMV3_DECODER)        += x86/vc1dsp_mmx.o
+MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
+                                          x86/h264_deblock_sse2.o       \
+
+OBJS-$(HAVE_MMX)                       += x86/cpuid.o                   \
+                                          x86/dnxhd_mmx.o               \
+                                          x86/dsputil_mmx.o             \
+                                          x86/fdct_mmx.o                \
+                                          x86/idct_mmx_xvid.o           \
+                                          x86/idct_sse2_xvid.o          \
+                                          x86/motion_est_mmx.o          \
+                                          x86/mpegvideo_mmx.o           \
+                                          x86/simple_idct_mmx.o         \
                                           $(MMX-OBJS-yes)

-OBJS-$(CONFIG_FFT_MMX)                 += i386/fft_3dn.o                \
-                                          i386/fft_3dn2.o               \
-                                          i386/fft_mmx.o                \
-                                          i386/fft_sse.o                \
+OBJS-$(CONFIG_FFT_MMX)                 += x86/fft_3dn.o                 \
+                                          x86/fft_3dn2.o                \
+                                          x86/fft_mmx.o                 \
+                                          x86/fft_sse.o                 \

 OBJS-$(ARCH_ALPHA)                     += alpha/dsputil_alpha.o         \
                                           alpha/dsputil_alpha_asm.o     \
......

 TESTS = $(addsuffix -test$(EXESUF), cabac dct eval fft h264 rangecoder snow)
 TESTS-$(CONFIG_OLDSCALER) += imgresample-test$(EXESUF)
-TESTS-$(ARCH_X86) += i386/cpuid-test$(EXESUF) motion-test$(EXESUF)
+TESTS-$(ARCH_X86) += x86/cpuid-test$(EXESUF) motion-test$(EXESUF)

 CLEANFILES = apiexample$(EXESUF)
-DIRS = alpha arm bfin i386 mlib ppc ps2 sh4 sparc
+DIRS = alpha arm bfin mlib ppc ps2 sh4 sparc x86

 include $(SUBDIR)../subdir.mak

libavcodec/dct-test.c
 #include "aandcttab.h"
 #include "faandct.h"
 #include "faanidct.h"
-#include "i386/idct_xvid.h"
+#include "x86/idct_xvid.h"

 #undef printf
 #undef random
libavcodec/h264.c

 #include "cabac.h"
 #ifdef ARCH_X86
-#include "i386/h264_i386.h"
+#include "x86/h264_i386.h"
 #endif

 //#undef NDEBUG
libavcodec/i386/cavsdsp_mmx.c
1
/*
2
 * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
3
 * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
4
 *
5
 * MMX-optimized DSP functions, based on H.264 optimizations by
6
 * Michael Niedermayer and Loren Merritt
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

  
25
#include "libavutil/common.h"
26
#include "libavutil/x86_cpu.h"
27
#include "libavcodec/dsputil.h"
28
#include "dsputil_mmx.h"
29

  
30
/*****************************************************************************
31
 *
32
 * inverse transform
33
 *
34
 ****************************************************************************/
35

  
36
static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
37
{
38
    __asm__ volatile(
39
        "movq 112(%0), %%mm4  \n\t" /* mm4 = src7 */
40
        "movq  16(%0), %%mm5  \n\t" /* mm5 = src1 */
41
        "movq  80(%0), %%mm2  \n\t" /* mm2 = src5 */
42
        "movq  48(%0), %%mm7  \n\t" /* mm7 = src3 */
43
        "movq   %%mm4, %%mm0  \n\t"
44
        "movq   %%mm5, %%mm3  \n\t"
45
        "movq   %%mm2, %%mm6  \n\t"
46
        "movq   %%mm7, %%mm1  \n\t"
47

  
48
        "paddw  %%mm4, %%mm4  \n\t" /* mm4 = 2*src7 */
49
        "paddw  %%mm3, %%mm3  \n\t" /* mm3 = 2*src1 */
50
        "paddw  %%mm6, %%mm6  \n\t" /* mm6 = 2*src5 */
51
        "paddw  %%mm1, %%mm1  \n\t" /* mm1 = 2*src3 */
52
        "paddw  %%mm4, %%mm0  \n\t" /* mm0 = 3*src7 */
53
        "paddw  %%mm3, %%mm5  \n\t" /* mm5 = 3*src1 */
54
        "paddw  %%mm6, %%mm2  \n\t" /* mm2 = 3*src5 */
55
        "paddw  %%mm1, %%mm7  \n\t" /* mm7 = 3*src3 */
56
        "psubw  %%mm4, %%mm5  \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
57
        "paddw  %%mm6, %%mm7  \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
58
        "psubw  %%mm2, %%mm1  \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
59
        "paddw  %%mm0, %%mm3  \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */
60

  
61
        "movq   %%mm5, %%mm4  \n\t"
62
        "movq   %%mm7, %%mm6  \n\t"
63
        "movq   %%mm3, %%mm0  \n\t"
64
        "movq   %%mm1, %%mm2  \n\t"
65
        SUMSUB_BA( %%mm7, %%mm5 )   /* mm7 = a0 + a1  mm5 = a0 - a1 */
66
        "paddw  %%mm3, %%mm7  \n\t" /* mm7 = a0 + a1 + a3 */
67
        "paddw  %%mm1, %%mm5  \n\t" /* mm5 = a0 - a1 + a2 */
68
        "paddw  %%mm7, %%mm7  \n\t"
69
        "paddw  %%mm5, %%mm5  \n\t"
70
        "paddw  %%mm6, %%mm7  \n\t" /* mm7 = b4 */
71
        "paddw  %%mm4, %%mm5  \n\t" /* mm5 = b5 */
72

  
73
        SUMSUB_BA( %%mm1, %%mm3 )   /* mm1 = a3 + a2  mm3 = a3 - a2 */
74
        "psubw  %%mm1, %%mm4  \n\t" /* mm4 = a0 - a2 - a3 */
75
        "movq   %%mm4, %%mm1  \n\t" /* mm1 = a0 - a2 - a3 */
76
        "psubw  %%mm6, %%mm3  \n\t" /* mm3 = a3 - a2 - a1 */
77
        "paddw  %%mm1, %%mm1  \n\t"
78
        "paddw  %%mm3, %%mm3  \n\t"
79
        "psubw  %%mm2, %%mm1  \n\t" /* mm1 = b7 */
80
        "paddw  %%mm0, %%mm3  \n\t" /* mm3 = b6 */
81

  
82
        "movq  32(%0), %%mm2  \n\t" /* mm2 = src2 */
83
        "movq  96(%0), %%mm6  \n\t" /* mm6 = src6 */
84
        "movq   %%mm2, %%mm4  \n\t"
85
        "movq   %%mm6, %%mm0  \n\t"
86
        "psllw  $2,    %%mm4  \n\t" /* mm4 = 4*src2 */
87
        "psllw  $2,    %%mm6  \n\t" /* mm6 = 4*src6 */
88
        "paddw  %%mm4, %%mm2  \n\t" /* mm2 = 5*src2 */
89
        "paddw  %%mm6, %%mm0  \n\t" /* mm0 = 5*src6 */
90
        "paddw  %%mm2, %%mm2  \n\t"
91
        "paddw  %%mm0, %%mm0  \n\t"
92
        "psubw  %%mm0, %%mm4  \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
93
        "paddw  %%mm2, %%mm6  \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */
94

  
95
        "movq    (%0), %%mm2  \n\t" /* mm2 = src0 */
96
        "movq  64(%0), %%mm0  \n\t" /* mm0 = src4 */
97
        SUMSUB_BA( %%mm0, %%mm2 )   /* mm0 = src0+src4  mm2 = src0-src4 */
98
        "psllw  $3,    %%mm0  \n\t"
99
        "psllw  $3,    %%mm2  \n\t"
100
        "paddw  %1,    %%mm0  \n\t" /* add rounding bias */
101
        "paddw  %1,    %%mm2  \n\t" /* add rounding bias */
102

  
103
        SUMSUB_BA( %%mm6, %%mm0 )   /* mm6 = a4 + a6  mm0 = a4 - a6 */
104
        SUMSUB_BA( %%mm4, %%mm2 )   /* mm4 = a5 + a7  mm2 = a5 - a7 */
105
        SUMSUB_BA( %%mm7, %%mm6 )   /* mm7 = dst0  mm6 = dst7 */
106
        SUMSUB_BA( %%mm5, %%mm4 )   /* mm5 = dst1  mm4 = dst6 */
107
        SUMSUB_BA( %%mm3, %%mm2 )   /* mm3 = dst2  mm2 = dst5 */
108
        SUMSUB_BA( %%mm1, %%mm0 )   /* mm1 = dst3  mm0 = dst4 */
109
        :: "r"(block), "m"(bias)
110
    );
111
}
112

  
113
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
114
{
115
    int i;
116
    DECLARE_ALIGNED_8(int16_t, b2[64]);
117

  
118
    for(i=0; i<2; i++){
119
        DECLARE_ALIGNED_8(uint64_t, tmp);
120

  
121
        cavs_idct8_1d(block+4*i, ff_pw_4);
122

  
123
        __asm__ volatile(
124
            "psraw     $3, %%mm7  \n\t"
125
            "psraw     $3, %%mm6  \n\t"
126
            "psraw     $3, %%mm5  \n\t"
127
            "psraw     $3, %%mm4  \n\t"
128
            "psraw     $3, %%mm3  \n\t"
129
            "psraw     $3, %%mm2  \n\t"
130
            "psraw     $3, %%mm1  \n\t"
131
            "psraw     $3, %%mm0  \n\t"
132
            "movq   %%mm7,    %0   \n\t"
133
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
134
            "movq   %%mm0,  8(%1)  \n\t"
135
            "movq   %%mm6, 24(%1)  \n\t"
136
            "movq   %%mm7, 40(%1)  \n\t"
137
            "movq   %%mm4, 56(%1)  \n\t"
138
            "movq    %0,    %%mm7  \n\t"
139
            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
140
            "movq   %%mm7,   (%1)  \n\t"
141
            "movq   %%mm1, 16(%1)  \n\t"
142
            "movq   %%mm0, 32(%1)  \n\t"
143
            "movq   %%mm3, 48(%1)  \n\t"
144
            : "=m"(tmp)
145
            : "r"(b2+32*i)
146
            : "memory"
147
        );
148
    }
149

  
150
    for(i=0; i<2; i++){
151
        cavs_idct8_1d(b2+4*i, ff_pw_64);
152

  
153
        __asm__ volatile(
154
            "psraw     $7, %%mm7  \n\t"
155
            "psraw     $7, %%mm6  \n\t"
156
            "psraw     $7, %%mm5  \n\t"
157
            "psraw     $7, %%mm4  \n\t"
158
            "psraw     $7, %%mm3  \n\t"
159
            "psraw     $7, %%mm2  \n\t"
160
            "psraw     $7, %%mm1  \n\t"
161
            "psraw     $7, %%mm0  \n\t"
162
            "movq   %%mm7,    (%0)  \n\t"
163
            "movq   %%mm5,  16(%0)  \n\t"
164
            "movq   %%mm3,  32(%0)  \n\t"
165
            "movq   %%mm1,  48(%0)  \n\t"
166
            "movq   %%mm0,  64(%0)  \n\t"
167
            "movq   %%mm2,  80(%0)  \n\t"
168
            "movq   %%mm4,  96(%0)  \n\t"
169
            "movq   %%mm6, 112(%0)  \n\t"
170
            :: "r"(b2+4*i)
171
            : "memory"
172
        );
173
    }
174

  
175
    add_pixels_clamped_mmx(b2, dst, stride);
176

  
177
    /* clear block */
178
    __asm__ volatile(
179
            "pxor %%mm7, %%mm7   \n\t"
180
            "movq %%mm7, (%0)    \n\t"
181
            "movq %%mm7, 8(%0)   \n\t"
182
            "movq %%mm7, 16(%0)  \n\t"
183
            "movq %%mm7, 24(%0)  \n\t"
184
            "movq %%mm7, 32(%0)  \n\t"
185
            "movq %%mm7, 40(%0)  \n\t"
186
            "movq %%mm7, 48(%0)  \n\t"
187
            "movq %%mm7, 56(%0)  \n\t"
188
            "movq %%mm7, 64(%0)  \n\t"
189
            "movq %%mm7, 72(%0)  \n\t"
190
            "movq %%mm7, 80(%0)  \n\t"
191
            "movq %%mm7, 88(%0)  \n\t"
192
            "movq %%mm7, 96(%0)  \n\t"
193
            "movq %%mm7, 104(%0) \n\t"
194
            "movq %%mm7, 112(%0) \n\t"
195
            "movq %%mm7, 120(%0) \n\t"
196
            :: "r" (block)
197
    );
198
}
199

  
200
/*****************************************************************************
201
 *
202
 * motion compensation
203
 *
204
 ****************************************************************************/
205

  
206
/* vertical filter [-1 -2 96 42 -7  0]  */
207
#define QPEL_CAVSV1(A,B,C,D,E,F,OP)      \
208
        "movd (%0), "#F"            \n\t"\
209
        "movq "#C", %%mm6           \n\t"\
210
        "pmullw %5, %%mm6           \n\t"\
211
        "movq "#D", %%mm7           \n\t"\
212
        "pmullw %6, %%mm7           \n\t"\
213
        "psllw $3, "#E"             \n\t"\
214
        "psubw "#E", %%mm6          \n\t"\
215
        "psraw $3, "#E"             \n\t"\
216
        "paddw %%mm7, %%mm6         \n\t"\
217
        "paddw "#E", %%mm6          \n\t"\
218
        "paddw "#B", "#B"           \n\t"\
219
        "pxor %%mm7, %%mm7          \n\t"\
220
        "add %2, %0                 \n\t"\
221
        "punpcklbw %%mm7, "#F"      \n\t"\
222
        "psubw "#B", %%mm6          \n\t"\
223
        "psraw $1, "#B"             \n\t"\
224
        "psubw "#A", %%mm6          \n\t"\
225
        "paddw %4, %%mm6            \n\t"\
226
        "psraw $7, %%mm6            \n\t"\
227
        "packuswb %%mm6, %%mm6      \n\t"\
228
        OP(%%mm6, (%1), A, d)            \
229
        "add %3, %1                 \n\t"
230

  
231
/* vertical filter [ 0 -1  5  5 -1  0]  */
232
#define QPEL_CAVSV2(A,B,C,D,E,F,OP)      \
233
        "movd (%0), "#F"            \n\t"\
234
        "movq "#C", %%mm6           \n\t"\
235
        "paddw "#D", %%mm6          \n\t"\
236
        "pmullw %5, %%mm6           \n\t"\
237
        "add %2, %0                 \n\t"\
238
        "punpcklbw %%mm7, "#F"      \n\t"\
239
        "psubw "#B", %%mm6          \n\t"\
240
        "psubw "#E", %%mm6          \n\t"\
241
        "paddw %4, %%mm6            \n\t"\
242
        "psraw $3, %%mm6            \n\t"\
243
        "packuswb %%mm6, %%mm6      \n\t"\
244
        OP(%%mm6, (%1), A, d)            \
245
        "add %3, %1                 \n\t"
246

  
247
/* vertical filter [ 0 -7 42 96 -2 -1]  */
248
#define QPEL_CAVSV3(A,B,C,D,E,F,OP)      \
249
        "movd (%0), "#F"            \n\t"\
250
        "movq "#C", %%mm6           \n\t"\
251
        "pmullw %6, %%mm6           \n\t"\
252
        "movq "#D", %%mm7           \n\t"\
253
        "pmullw %5, %%mm7           \n\t"\
254
        "psllw $3, "#B"             \n\t"\
255
        "psubw "#B", %%mm6          \n\t"\
256
        "psraw $3, "#B"             \n\t"\
257
        "paddw %%mm7, %%mm6         \n\t"\
258
        "paddw "#B", %%mm6          \n\t"\
259
        "paddw "#E", "#E"           \n\t"\
260
        "pxor %%mm7, %%mm7          \n\t"\
261
        "add %2, %0                 \n\t"\
262
        "punpcklbw %%mm7, "#F"      \n\t"\
263
        "psubw "#E", %%mm6          \n\t"\
264
        "psraw $1, "#E"             \n\t"\
265
        "psubw "#F", %%mm6          \n\t"\
266
        "paddw %4, %%mm6            \n\t"\
267
        "psraw $7, %%mm6            \n\t"\
268
        "packuswb %%mm6, %%mm6      \n\t"\
269
        OP(%%mm6, (%1), A, d)            \
270
        "add %3, %1                 \n\t"
271

  
272

  
273
#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
274
    int w= 2;\
275
    src -= 2*srcStride;\
276
    \
277
    while(w--){\
278
      __asm__ volatile(\
279
        "pxor %%mm7, %%mm7          \n\t"\
280
        "movd (%0), %%mm0           \n\t"\
281
        "add %2, %0                 \n\t"\
282
        "movd (%0), %%mm1           \n\t"\
283
        "add %2, %0                 \n\t"\
284
        "movd (%0), %%mm2           \n\t"\
285
        "add %2, %0                 \n\t"\
286
        "movd (%0), %%mm3           \n\t"\
287
        "add %2, %0                 \n\t"\
288
        "movd (%0), %%mm4           \n\t"\
289
        "add %2, %0                 \n\t"\
290
        "punpcklbw %%mm7, %%mm0     \n\t"\
291
        "punpcklbw %%mm7, %%mm1     \n\t"\
292
        "punpcklbw %%mm7, %%mm2     \n\t"\
293
        "punpcklbw %%mm7, %%mm3     \n\t"\
294
        "punpcklbw %%mm7, %%mm4     \n\t"\
295
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
296
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
297
        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
298
        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
299
        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
300
        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
301
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
302
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
303
        \
304
        : "+a"(src), "+c"(dst)\
305
        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
306
        : "memory"\
307
     );\
308
     if(h==16){\
309
        __asm__ volatile(\
310
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
311
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
312
            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
313
            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
314
            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
315
            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
316
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
317
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
318
            \
319
           : "+a"(src), "+c"(dst)\
320
           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD),  "m"(MUL1), "m"(MUL2)\
321
           : "memory"\
322
        );\
323
     }\
324
     src += 4-(h+5)*srcStride;\
325
     dst += 4-h*dstStride;\
326
   }
327

  
328
#define QPEL_CAVS(OPNAME, OP, MMX)\
329
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
330
    int h=8;\
331
    __asm__ volatile(\
332
        "pxor %%mm7, %%mm7          \n\t"\
333
        "movq %5, %%mm6             \n\t"\
334
        "1:                         \n\t"\
335
        "movq    (%0), %%mm0        \n\t"\
336
        "movq   1(%0), %%mm2        \n\t"\
337
        "movq %%mm0, %%mm1          \n\t"\
338
        "movq %%mm2, %%mm3          \n\t"\
339
        "punpcklbw %%mm7, %%mm0     \n\t"\
340
        "punpckhbw %%mm7, %%mm1     \n\t"\
341
        "punpcklbw %%mm7, %%mm2     \n\t"\
342
        "punpckhbw %%mm7, %%mm3     \n\t"\
343
        "paddw %%mm2, %%mm0         \n\t"\
344
        "paddw %%mm3, %%mm1         \n\t"\
345
        "pmullw %%mm6, %%mm0        \n\t"\
346
        "pmullw %%mm6, %%mm1        \n\t"\
347
        "movq   -1(%0), %%mm2       \n\t"\
348
        "movq    2(%0), %%mm4       \n\t"\
349
        "movq %%mm2, %%mm3          \n\t"\
350
        "movq %%mm4, %%mm5          \n\t"\
351
        "punpcklbw %%mm7, %%mm2     \n\t"\
352
        "punpckhbw %%mm7, %%mm3     \n\t"\
353
        "punpcklbw %%mm7, %%mm4     \n\t"\
354
        "punpckhbw %%mm7, %%mm5     \n\t"\
355
        "paddw %%mm4, %%mm2         \n\t"\
356
        "paddw %%mm3, %%mm5         \n\t"\
357
        "psubw %%mm2, %%mm0         \n\t"\
358
        "psubw %%mm5, %%mm1         \n\t"\
359
        "movq %6, %%mm5             \n\t"\
360
        "paddw %%mm5, %%mm0         \n\t"\
361
        "paddw %%mm5, %%mm1         \n\t"\
362
        "psraw $3, %%mm0            \n\t"\
363
        "psraw $3, %%mm1            \n\t"\
364
        "packuswb %%mm1, %%mm0      \n\t"\
365
        OP(%%mm0, (%1),%%mm5, q)         \
366
        "add %3, %0                 \n\t"\
367
        "add %4, %1                 \n\t"\
368
        "decl %2                    \n\t"\
369
        " jnz 1b                    \n\t"\
370
        : "+a"(src), "+c"(dst), "+m"(h)\
371
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
372
        : "memory"\
373
    );\
374
}\
375
\
376
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
377
  QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42)      \
378
}\
379
\
380
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
381
  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5)         \
382
}\
383
\
384
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
385
  QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42)      \
386
}\
387
\
388
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
389
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
390
}\
391
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
392
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
393
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
394
}\
395
\
396
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
397
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
398
}\
399
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
400
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
401
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
402
}\
403
\
404
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
405
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
406
}\
407
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
408
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
409
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
410
}\
411
\
412
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
413
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
414
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
415
    src += 8*srcStride;\
416
    dst += 8*dstStride;\
417
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
418
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
419
}\
420

  
421
#define CAVS_MC(OPNAME, SIZE, MMX) \
422
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
423
    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
424
}\
425
\
426
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
427
    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
428
}\
429
\
430
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
431
    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
432
}\
433
\
434
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
435
    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
436
}\
437

  
438
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "    \n\t"
439
#define AVG_3DNOW_OP(a,b,temp, size) \
440
"mov" #size " " #b ", " #temp "   \n\t"\
441
"pavgusb " #temp ", " #a "        \n\t"\
442
"mov" #size " " #a ", " #b "      \n\t"
443
#define AVG_MMX2_OP(a,b,temp, size) \
444
"mov" #size " " #b ", " #temp "   \n\t"\
445
"pavgb " #temp ", " #a "          \n\t"\
446
"mov" #size " " #a ", " #b "      \n\t"
447

  
448
QPEL_CAVS(put_,       PUT_OP, 3dnow)
449
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
450
QPEL_CAVS(put_,       PUT_OP, mmx2)
451
QPEL_CAVS(avg_,  AVG_MMX2_OP, mmx2)
452

  
453
CAVS_MC(put_, 8, 3dnow)
454
CAVS_MC(put_, 16,3dnow)
455
CAVS_MC(avg_, 8, 3dnow)
456
CAVS_MC(avg_, 16,3dnow)
457
CAVS_MC(put_, 8, mmx2)
458
CAVS_MC(put_, 16,mmx2)
459
CAVS_MC(avg_, 8, mmx2)
460
CAVS_MC(avg_, 16,mmx2)
461

  
462
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
463
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
464
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
465
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
466

  
467
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx) {
468
#define dspfunc(PFX, IDX, NUM) \
469
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
470
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \
471
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \
472
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \
473
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \
474

  
475
    dspfunc(put_cavs_qpel, 0, 16);
476
    dspfunc(put_cavs_qpel, 1, 8);
477
    dspfunc(avg_cavs_qpel, 0, 16);
478
    dspfunc(avg_cavs_qpel, 1, 8);
479
#undef dspfunc
480
    c->cavs_idct8_add = cavs_idct8_add_mmx;
481
}
482

  
483
void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx) {
484
#define dspfunc(PFX, IDX, NUM) \
485
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
486
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \
487
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \
488
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \
489
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \
490

  
491
    dspfunc(put_cavs_qpel, 0, 16);
492
    dspfunc(put_cavs_qpel, 1, 8);
493
    dspfunc(avg_cavs_qpel, 0, 16);
494
    dspfunc(avg_cavs_qpel, 1, 8);
495
#undef dspfunc
496
    c->cavs_idct8_add = cavs_idct8_add_mmx;
497
}
libavcodec/i386/cpuid.c
1
/*
2
 * CPU detection code, extracted from mmx.h
3
 * (c)1997-99 by H. Dietz and R. Fisher
4
 * Converted to C and improved by Fabrice Bellard.
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 */
22

  
23
#include <stdlib.h>
24
#include "libavutil/x86_cpu.h"
25
#include "libavcodec/dsputil.h"
26

  
27
#undef printf
28

  
29
/* ebx saving is necessary for PIC. gcc seems unable to see it alone */
30
#define cpuid(index,eax,ebx,ecx,edx)\
31
    __asm__ volatile\
32
        ("mov %%"REG_b", %%"REG_S"\n\t"\
33
         "cpuid\n\t"\
34
         "xchg %%"REG_b", %%"REG_S\
35
         : "=a" (eax), "=S" (ebx),\
36
           "=c" (ecx), "=d" (edx)\
37
         : "0" (index));
38

  
39
/* Function to test if multimedia instructions are supported...  */
40
int mm_support(void)
41
{
42
    int rval = 0;
43
    int eax, ebx, ecx, edx;
44
    int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
45
    x86_reg a, c;
46

  
47
#ifdef ARCH_X86_64
48
#define PUSHF "pushfq\n\t"
49
#define POPF "popfq\n\t"
50
#else
51
#define PUSHF "pushfl\n\t"
52
#define POPF "popfl\n\t"
53
#endif
54
    __asm__ volatile (
55
        /* See if CPUID instruction is supported ... */
56
        /* ... Get copies of EFLAGS into eax and ecx */
57
        PUSHF
58
        "pop %0\n\t"
59
        "mov %0, %1\n\t"
60

  
61
        /* ... Toggle the ID bit in one copy and store */
62
        /*     to the EFLAGS reg */
63
        "xor $0x200000, %0\n\t"
64
        "push %0\n\t"
65
        POPF
66

  
67
        /* ... Get the (hopefully modified) EFLAGS */
68
        PUSHF
69
        "pop %0\n\t"
70
        : "=a" (a), "=c" (c)
71
        :
72
        : "cc"
73
        );
74

  
75
    if (a == c)
76
        return 0; /* CPUID not supported */
77

  
78
    cpuid(0, max_std_level, ebx, ecx, edx);
79

  
80
    if(max_std_level >= 1){
81
        cpuid(1, eax, ebx, ecx, std_caps);
82
        if (std_caps & (1<<23))
83
            rval |= FF_MM_MMX;
84
        if (std_caps & (1<<25))
85
            rval |= FF_MM_MMXEXT
86
#if !defined(__GNUC__) || __GNUC__ > 2
87
                  | FF_MM_SSE;
88
        if (std_caps & (1<<26))
89
            rval |= FF_MM_SSE2;
90
        if (ecx & 1)
91
            rval |= FF_MM_SSE3;
92
        if (ecx & 0x00000200 )
93
            rval |= FF_MM_SSSE3
94
#endif
95
                  ;
96
    }
97

  
98
    cpuid(0x80000000, max_ext_level, ebx, ecx, edx);
99

  
100
    if(max_ext_level >= 0x80000001){
101
        cpuid(0x80000001, eax, ebx, ecx, ext_caps);
102
        if (ext_caps & (1<<31))
103
            rval |= FF_MM_3DNOW;
104
        if (ext_caps & (1<<30))
105
            rval |= FF_MM_3DNOWEXT;
106
        if (ext_caps & (1<<23))
107
            rval |= FF_MM_MMX;
108
        if (ext_caps & (1<<22))
109
            rval |= FF_MM_MMXEXT;
110
    }
111

  
112
#if 0
113
    av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s\n",
114
        (rval&FF_MM_MMX) ? "MMX ":"",
115
        (rval&FF_MM_MMXEXT) ? "MMX2 ":"",
116
        (rval&FF_MM_SSE) ? "SSE ":"",
117
        (rval&FF_MM_SSE2) ? "SSE2 ":"",
118
        (rval&FF_MM_SSE3) ? "SSE3 ":"",
119
        (rval&FF_MM_SSSE3) ? "SSSE3 ":"",
120
        (rval&FF_MM_3DNOW) ? "3DNow ":"",
121
        (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":"");
122
#endif
123
    return rval;
124
}
125

  
126
#ifdef TEST
127
int main ( void )
128
{
129
    int mm_flags;
130
    mm_flags = mm_support();
131
    printf("mm_support = 0x%08X\n",mm_flags);
132
    return 0;
133
}
134
#endif
libavcodec/i386/dnxhd_mmx.c
1
/*
2
 * VC3/DNxHD SIMD functions
3
 * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
4
 *
5
 * VC-3 encoder funded by the British Broadcasting Corporation
6
 *
7
 * This file is part of FFmpeg.
8
 *
9
 * FFmpeg is free software; you can redistribute it and/or
10
 * modify it under the terms of the GNU Lesser General Public
11
 * License as published by the Free Software Foundation; either
12
 * version 2.1 of the License, or (at your option) any later version.
13
 *
14
 * FFmpeg is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 * Lesser General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU Lesser General Public
20
 * License along with FFmpeg; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
 */
23

  
24
#include "libavutil/x86_cpu.h"
25
#include "libavcodec/dnxhdenc.h"
26

  
27
static void get_pixels_8x4_sym_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
28
{
29
    __asm__ volatile(
30
        "pxor %%xmm7,      %%xmm7       \n\t"
31
        "movq (%0),        %%xmm0       \n\t"
32
        "add  %2,          %0           \n\t"
33
        "movq (%0),        %%xmm1       \n\t"
34
        "movq (%0, %2),    %%xmm2       \n\t"
35
        "movq (%0, %2,2),  %%xmm3       \n\t"
36
        "punpcklbw %%xmm7, %%xmm0       \n\t"
37
        "punpcklbw %%xmm7, %%xmm1       \n\t"
38
        "punpcklbw %%xmm7, %%xmm2       \n\t"
39
        "punpcklbw %%xmm7, %%xmm3       \n\t"
40
        "movdqa %%xmm0,      (%1)       \n\t"
41
        "movdqa %%xmm1,    16(%1)       \n\t"
42
        "movdqa %%xmm2,    32(%1)       \n\t"
43
        "movdqa %%xmm3,    48(%1)       \n\t"
44
        "movdqa %%xmm3 ,   64(%1)       \n\t"
45
        "movdqa %%xmm2 ,   80(%1)       \n\t"
46
        "movdqa %%xmm1 ,   96(%1)       \n\t"
47
        "movdqa %%xmm0,   112(%1)       \n\t"
48
        : "+r" (pixels)
49
        : "r" (block), "r" ((x86_reg)line_size)
50
    );
51
}
52

  
53
void ff_dnxhd_init_mmx(DNXHDEncContext *ctx)
54
{
55
    if (mm_flags & FF_MM_SSE2) {
56
        ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2;
57
    }
58
}
libavcodec/i386/dsputil_h264_template_mmx.c
1
/*
2
 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
3
 *                    Loren Merritt
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21

  
22
/**
23
 * MMX optimized version of (put|avg)_h264_chroma_mc8.
24
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
25
 * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
26
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
27
 */
28
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
29
{
30
    const uint64_t *rnd_reg;
31
    DECLARE_ALIGNED_8(uint64_t, AA);
32
    DECLARE_ALIGNED_8(uint64_t, DD);
33
    int i;
34

  
35
    if(y==0 && x==0) {
36
        /* no filter needed */
37
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
38
        return;
39
    }
40

  
41
    assert(x<8 && y<8 && x>=0 && y>=0);
42

  
43
    if(y==0 || x==0)
44
    {
45
        /* 1 dimensional filter only */
46
        const int dxy = x ? 1 : stride;
47

  
48
        rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3;
49

  
50
        __asm__ volatile(
51
            "movd %0, %%mm5\n\t"
52
            "movq %1, %%mm4\n\t"
53
            "movq %2, %%mm6\n\t"         /* mm6 = rnd */
54
            "punpcklwd %%mm5, %%mm5\n\t"
55
            "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
56
            "pxor %%mm7, %%mm7\n\t"
57
            "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
58
            :: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg));
59

  
60
        for(i=0; i<h; i++) {
61
            __asm__ volatile(
62
                /* mm0 = src[0..7], mm1 = src[1..8] */
63
                "movq %0, %%mm0\n\t"
64
                "movq %1, %%mm2\n\t"
65
                :: "m"(src[0]), "m"(src[dxy]));
66

  
67
            __asm__ volatile(
68
                /* [mm0,mm1] = A * src[0..7] */
69
                /* [mm2,mm3] = B * src[1..8] */
70
                "movq %%mm0, %%mm1\n\t"
71
                "movq %%mm2, %%mm3\n\t"
72
                "punpcklbw %%mm7, %%mm0\n\t"
73
                "punpckhbw %%mm7, %%mm1\n\t"
74
                "punpcklbw %%mm7, %%mm2\n\t"
75
                "punpckhbw %%mm7, %%mm3\n\t"
76
                "pmullw %%mm4, %%mm0\n\t"
77
                "pmullw %%mm4, %%mm1\n\t"
78
                "pmullw %%mm5, %%mm2\n\t"
79
                "pmullw %%mm5, %%mm3\n\t"
80

  
81
                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */
82
                "paddw %%mm6, %%mm0\n\t"
83
                "paddw %%mm6, %%mm1\n\t"
84
                "paddw %%mm2, %%mm0\n\t"
85
                "paddw %%mm3, %%mm1\n\t"
86
                "psrlw $3, %%mm0\n\t"
87
                "psrlw $3, %%mm1\n\t"
88
                "packuswb %%mm1, %%mm0\n\t"
89
                H264_CHROMA_OP(%0, %%mm0)
90
                "movq %%mm0, %0\n\t"
91
                : "=m" (dst[0]));
92

  
93
            src += stride;
94
            dst += stride;
95
        }
96
        return;
97
    }
98

  
99
    /* general case, bilinear */
100
    rnd_reg = rnd ? &ff_pw_32.a : &ff_pw_28.a;
101
    __asm__ volatile("movd %2, %%mm4\n\t"
102
                 "movd %3, %%mm6\n\t"
103
                 "punpcklwd %%mm4, %%mm4\n\t"
104
                 "punpcklwd %%mm6, %%mm6\n\t"
105
                 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
106
                 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
107
                 "movq %%mm4, %%mm5\n\t"
108
                 "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
109
                 "psllw $3, %%mm5\n\t"
110
                 "psllw $3, %%mm6\n\t"
111
                 "movq %%mm5, %%mm7\n\t"
112
                 "paddw %%mm6, %%mm7\n\t"
113
                 "movq %%mm4, %1\n\t"         /* DD = x * y */
114
                 "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
115
                 "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
116
                 "paddw %4, %%mm4\n\t"
117
                 "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
118
                 "pxor %%mm7, %%mm7\n\t"
119
                 "movq %%mm4, %0\n\t"
120
                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
121

  
122
    __asm__ volatile(
123
        /* mm0 = src[0..7], mm1 = src[1..8] */
124
        "movq %0, %%mm0\n\t"
125
        "movq %1, %%mm1\n\t"
126
        : : "m" (src[0]), "m" (src[1]));
127

  
128
    for(i=0; i<h; i++) {
129
        src += stride;
130

  
131
        __asm__ volatile(
132
            /* mm2 = A * src[0..3] + B * src[1..4] */
133
            /* mm3 = A * src[4..7] + B * src[5..8] */
134
            "movq %%mm0, %%mm2\n\t"
135
            "movq %%mm1, %%mm3\n\t"
136
            "punpckhbw %%mm7, %%mm0\n\t"
137
            "punpcklbw %%mm7, %%mm1\n\t"
138
            "punpcklbw %%mm7, %%mm2\n\t"
139
            "punpckhbw %%mm7, %%mm3\n\t"
140
            "pmullw %0, %%mm0\n\t"
141
            "pmullw %0, %%mm2\n\t"
142
            "pmullw %%mm5, %%mm1\n\t"
143
            "pmullw %%mm5, %%mm3\n\t"
144
            "paddw %%mm1, %%mm2\n\t"
145
            "paddw %%mm0, %%mm3\n\t"
146
            : : "m" (AA));
147

  
148
        __asm__ volatile(
149
            /* [mm2,mm3] += C * src[0..7] */
150
            "movq %0, %%mm0\n\t"
151
            "movq %%mm0, %%mm1\n\t"
152
            "punpcklbw %%mm7, %%mm0\n\t"
153
            "punpckhbw %%mm7, %%mm1\n\t"
154
            "pmullw %%mm6, %%mm0\n\t"
155
            "pmullw %%mm6, %%mm1\n\t"
156
            "paddw %%mm0, %%mm2\n\t"
157
            "paddw %%mm1, %%mm3\n\t"
158
            : : "m" (src[0]));
159

  
160
        __asm__ volatile(
161
            /* [mm2,mm3] += D * src[1..8] */
162
            "movq %1, %%mm1\n\t"
163
            "movq %%mm1, %%mm0\n\t"
164
            "movq %%mm1, %%mm4\n\t"
165
            "punpcklbw %%mm7, %%mm0\n\t"
166
            "punpckhbw %%mm7, %%mm4\n\t"
167
            "pmullw %2, %%mm0\n\t"
168
            "pmullw %2, %%mm4\n\t"
169
            "paddw %%mm0, %%mm2\n\t"
170
            "paddw %%mm4, %%mm3\n\t"
171
            "movq %0, %%mm0\n\t"
172
            : : "m" (src[0]), "m" (src[1]), "m" (DD));
173

  
174
        __asm__ volatile(
175
            /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */
176
            "paddw %1, %%mm2\n\t"
177
            "paddw %1, %%mm3\n\t"
178
            "psrlw $6, %%mm2\n\t"
179
            "psrlw $6, %%mm3\n\t"
180
            "packuswb %%mm3, %%mm2\n\t"
181
            H264_CHROMA_OP(%0, %%mm2)
182
            "movq %%mm2, %0\n\t"
183
            : "=m" (dst[0]) : "m" (*rnd_reg));
184
        dst+= stride;
185
    }
186
}
187

  
188
static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
189
{
190
    __asm__ volatile(
191
        "pxor   %%mm7, %%mm7        \n\t"
192
        "movd %5, %%mm2             \n\t"
193
        "movd %6, %%mm3             \n\t"
194
        "movq "MANGLE(ff_pw_8)", %%mm4\n\t"
195
        "movq "MANGLE(ff_pw_8)", %%mm5\n\t"
196
        "punpcklwd %%mm2, %%mm2     \n\t"
197
        "punpcklwd %%mm3, %%mm3     \n\t"
198
        "punpcklwd %%mm2, %%mm2     \n\t"
199
        "punpcklwd %%mm3, %%mm3     \n\t"
200
        "psubw %%mm2, %%mm4         \n\t"
201
        "psubw %%mm3, %%mm5         \n\t"
202

  
203
        "movd  (%1), %%mm0          \n\t"
204
        "movd 1(%1), %%mm6          \n\t"
205
        "add %3, %1                 \n\t"
206
        "punpcklbw %%mm7, %%mm0     \n\t"
207
        "punpcklbw %%mm7, %%mm6     \n\t"
208
        "pmullw %%mm4, %%mm0        \n\t"
209
        "pmullw %%mm2, %%mm6        \n\t"
210
        "paddw %%mm0, %%mm6         \n\t"
211

  
212
        "1:                         \n\t"
213
        "movd  (%1), %%mm0          \n\t"
214
        "movd 1(%1), %%mm1          \n\t"
215
        "add %3, %1                 \n\t"
216
        "punpcklbw %%mm7, %%mm0     \n\t"
217
        "punpcklbw %%mm7, %%mm1     \n\t"
218
        "pmullw %%mm4, %%mm0        \n\t"
219
        "pmullw %%mm2, %%mm1        \n\t"
220
        "paddw %%mm0, %%mm1         \n\t"
221
        "movq %%mm1, %%mm0          \n\t"
222
        "pmullw %%mm5, %%mm6        \n\t"
223
        "pmullw %%mm3, %%mm1        \n\t"
224
        "paddw %4, %%mm6            \n\t"
225
        "paddw %%mm6, %%mm1         \n\t"
226
        "psrlw $6, %%mm1            \n\t"
227
        "packuswb %%mm1, %%mm1      \n\t"
228
        H264_CHROMA_OP4((%0), %%mm1, %%mm6)
229
        "movd %%mm1, (%0)           \n\t"
230
        "add %3, %0                 \n\t"
231
        "movd  (%1), %%mm6          \n\t"
232
        "movd 1(%1), %%mm1          \n\t"
233
        "add %3, %1                 \n\t"
234
        "punpcklbw %%mm7, %%mm6     \n\t"
235
        "punpcklbw %%mm7, %%mm1     \n\t"
236
        "pmullw %%mm4, %%mm6        \n\t"
237
        "pmullw %%mm2, %%mm1        \n\t"
238
        "paddw %%mm6, %%mm1         \n\t"
239
        "movq %%mm1, %%mm6          \n\t"
240
        "pmullw %%mm5, %%mm0        \n\t"
241
        "pmullw %%mm3, %%mm1        \n\t"
242
        "paddw %4, %%mm0            \n\t"
243
        "paddw %%mm0, %%mm1         \n\t"
244
        "psrlw $6, %%mm1            \n\t"
245
        "packuswb %%mm1, %%mm1      \n\t"
246
        H264_CHROMA_OP4((%0), %%mm1, %%mm0)
247
        "movd %%mm1, (%0)           \n\t"
248
        "add %3, %0                 \n\t"
249
        "sub $2, %2                 \n\t"
250
        "jnz 1b                     \n\t"
251
        : "+r"(dst), "+r"(src), "+r"(h)
252
        : "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y)
253
    );
254
}
255

  
256
#ifdef H264_CHROMA_MC2_TMPL
257
static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
258
{
259
    int tmp = ((1<<16)-1)*x + 8;
260
    int CD= tmp*y;
261
    int AB= (tmp<<3) - CD;
262
    __asm__ volatile(
263
        /* mm5 = {A,B,A,B} */
264
        /* mm6 = {C,D,C,D} */
265
        "movd %0, %%mm5\n\t"
266
        "movd %1, %%mm6\n\t"
267
        "punpckldq %%mm5, %%mm5\n\t"
268
        "punpckldq %%mm6, %%mm6\n\t"
269
        "pxor %%mm7, %%mm7\n\t"
270
        /* mm0 = src[0,1,1,2] */
271
        "movd %2, %%mm2\n\t"
272
        "punpcklbw %%mm7, %%mm2\n\t"
273
        "pshufw $0x94, %%mm2, %%mm2\n\t"
274
        :: "r"(AB), "r"(CD), "m"(src[0]));
275

  
276

  
277
    __asm__ volatile(
278
        "1:\n\t"
279
        "add %4, %1\n\t"
280
        /* mm1 = A * src[0,1] + B * src[1,2] */
281
        "movq    %%mm2, %%mm1\n\t"
282
        "pmaddwd %%mm5, %%mm1\n\t"
283
        /* mm0 = src[0,1,1,2] */
284
        "movd (%1), %%mm0\n\t"
285
        "punpcklbw %%mm7, %%mm0\n\t"
286
        "pshufw $0x94, %%mm0, %%mm0\n\t"
287
        /* mm1 += C * src[0,1] + D * src[1,2] */
288
        "movq    %%mm0, %%mm2\n\t"
289
        "pmaddwd %%mm6, %%mm0\n\t"
290
        "paddw      %3, %%mm1\n\t"
291
        "paddw   %%mm0, %%mm1\n\t"
292
        /* dst[0,1] = pack((mm1 + 32) >> 6) */
293
        "psrlw $6, %%mm1\n\t"
294
        "packssdw %%mm7, %%mm1\n\t"
295
        "packuswb %%mm7, %%mm1\n\t"
296
        H264_CHROMA_OP4((%0), %%mm1, %%mm3)
297
        "movd %%mm1, %%esi\n\t"
298
        "movw %%si, (%0)\n\t"
299
        "add %4, %0\n\t"
300
        "sub $1, %2\n\t"
301
        "jnz 1b\n\t"
302
        : "+r" (dst), "+r"(src), "+r"(h)
303
        : "m" (ff_pw_32), "r"((x86_reg)stride)
304
        : "%esi");
305

  
306
}
307
#endif
308

  
libavcodec/i386/dsputil_h264_template_ssse3.c
1
/*
2
 * Copyright (c) 2008 Loren Merritt
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

  
21
/**
22
 * SSSE3 optimized version of (put|avg)_h264_chroma_mc8.
23
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
24
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
25
 * AVG_OP must be defined to empty for put and the identify for avg
26
 */
27
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
28
{
29
    if(y==0 && x==0) {
30
        /* no filter needed */
31
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
32
        return;
33
    }
34

  
35
    assert(x<8 && y<8 && x>=0 && y>=0);
36

  
37
    if(y==0 || x==0)
38
    {
39
        /* 1 dimensional filter only */
40
        __asm__ volatile(
41
            "movd %0, %%xmm7 \n\t"
42
            "movq %1, %%xmm6 \n\t"
43
            "pshuflw $0, %%xmm7, %%xmm7 \n\t"
44
            "movlhps %%xmm6, %%xmm6 \n\t"
45
            "movlhps %%xmm7, %%xmm7 \n\t"
46
            :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3))
47
        );
48

  
49
        if(x) {
50
            __asm__ volatile(
51
                "1: \n\t"
52
                "movq (%1), %%xmm0 \n\t"
53
                "movq 1(%1), %%xmm1 \n\t"
54
                "movq (%1,%3), %%xmm2 \n\t"
55
                "movq 1(%1,%3), %%xmm3 \n\t"
56
                "punpcklbw %%xmm1, %%xmm0 \n\t"
57
                "punpcklbw %%xmm3, %%xmm2 \n\t"
58
                "pmaddubsw %%xmm7, %%xmm0 \n\t"
59
                "pmaddubsw %%xmm7, %%xmm2 \n\t"
60
         AVG_OP("movq (%0), %%xmm4 \n\t")
61
         AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
62
                "paddw %%xmm6, %%xmm0 \n\t"
63
                "paddw %%xmm6, %%xmm2 \n\t"
64
                "psrlw $3, %%xmm0 \n\t"
65
                "psrlw $3, %%xmm2 \n\t"
66
                "packuswb %%xmm2, %%xmm0 \n\t"
67
         AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
68
                "movq %%xmm0, (%0) \n\t"
69
                "movhps %%xmm0, (%0,%3) \n\t"
70
                "sub $2, %2 \n\t"
71
                "lea (%1,%3,2), %1 \n\t"
72
                "lea (%0,%3,2), %0 \n\t"
73
                "jg 1b \n\t"
74
                :"+r"(dst), "+r"(src), "+r"(h)
75
                :"r"((x86_reg)stride)
76
            );
77
        } else {
78
            __asm__ volatile(
79
                "1: \n\t"
80
                "movq (%1), %%xmm0 \n\t"
81
                "movq (%1,%3), %%xmm1 \n\t"
82
                "movdqa %%xmm1, %%xmm2 \n\t"
83
                "movq (%1,%3,2), %%xmm3 \n\t"
84
                "punpcklbw %%xmm1, %%xmm0 \n\t"
85
                "punpcklbw %%xmm3, %%xmm2 \n\t"
86
                "pmaddubsw %%xmm7, %%xmm0 \n\t"
87
                "pmaddubsw %%xmm7, %%xmm2 \n\t"
88
         AVG_OP("movq (%0), %%xmm4 \n\t")
89
         AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
90
                "paddw %%xmm6, %%xmm0 \n\t"
91
                "paddw %%xmm6, %%xmm2 \n\t"
92
                "psrlw $3, %%xmm0 \n\t"
93
                "psrlw $3, %%xmm2 \n\t"
94
                "packuswb %%xmm2, %%xmm0 \n\t"
95
         AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
96
                "movq %%xmm0, (%0) \n\t"
97
                "movhps %%xmm0, (%0,%3) \n\t"
98
                "sub $2, %2 \n\t"
99
                "lea (%1,%3,2), %1 \n\t"
100
                "lea (%0,%3,2), %0 \n\t"
101
                "jg 1b \n\t"
102
                :"+r"(dst), "+r"(src), "+r"(h)
103
                :"r"((x86_reg)stride)
104
            );
105
        }
106
        return;
107
    }
108

  
109
    /* general case, bilinear */
110
    __asm__ volatile(
111
        "movd %0, %%xmm7 \n\t"
112
        "movd %1, %%xmm6 \n\t"
113
        "movdqa %2, %%xmm5 \n\t"
114
        "pshuflw $0, %%xmm7, %%xmm7 \n\t"
115
        "pshuflw $0, %%xmm6, %%xmm6 \n\t"
116
        "movlhps %%xmm7, %%xmm7 \n\t"
117
        "movlhps %%xmm6, %%xmm6 \n\t"
118
        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28))
119
    );
120

  
121
    __asm__ volatile(
122
        "movq (%1), %%xmm0 \n\t"
123
        "movq 1(%1), %%xmm1 \n\t"
124
        "punpcklbw %%xmm1, %%xmm0 \n\t"
125
        "add %3, %1 \n\t"
126
        "1: \n\t"
127
        "movq (%1), %%xmm1 \n\t"
128
        "movq 1(%1), %%xmm2 \n\t"
129
        "movq (%1,%3), %%xmm3 \n\t"
130
        "movq 1(%1,%3), %%xmm4 \n\t"
131
        "lea (%1,%3,2), %1 \n\t"
132
        "punpcklbw %%xmm2, %%xmm1 \n\t"
133
        "punpcklbw %%xmm4, %%xmm3 \n\t"
134
        "movdqa %%xmm1, %%xmm2 \n\t"
135
        "movdqa %%xmm3, %%xmm4 \n\t"
136
        "pmaddubsw %%xmm7, %%xmm0 \n\t"
137
        "pmaddubsw %%xmm6, %%xmm1 \n\t"
138
        "pmaddubsw %%xmm7, %%xmm2 \n\t"
139
        "pmaddubsw %%xmm6, %%xmm3 \n\t"
140
        "paddw %%xmm5, %%xmm0 \n\t"
141
        "paddw %%xmm5, %%xmm2 \n\t"
142
        "paddw %%xmm0, %%xmm1 \n\t"
143
        "paddw %%xmm2, %%xmm3 \n\t"
144
        "movdqa %%xmm4, %%xmm0 \n\t"
145
        "psrlw $6, %%xmm1 \n\t"
146
        "psrlw $6, %%xmm3 \n\t"
147
 AVG_OP("movq (%0), %%xmm2 \n\t")
148
 AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
149
        "packuswb %%xmm3, %%xmm1 \n\t"
150
 AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
151
        "movq %%xmm1, (%0)\n\t"
152
        "movhps %%xmm1, (%0,%3)\n\t"
153
        "sub $2, %2 \n\t"
154
        "lea (%0,%3,2), %0 \n\t"
155
        "jg 1b \n\t"
156
        :"+r"(dst), "+r"(src), "+r"(h)
157
        :"r"((x86_reg)stride)
158
    );
159
}
160

  
161
static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
162
{
163
    __asm__ volatile(
164
        "movd %0, %%mm7 \n\t"
165
        "movd %1, %%mm6 \n\t"
166
        "movq %2, %%mm5 \n\t"
167
        "pshufw $0, %%mm7, %%mm7 \n\t"
168
        "pshufw $0, %%mm6, %%mm6 \n\t"
169
        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32)
170
    );
171

  
172
    __asm__ volatile(
173
        "movd (%1), %%mm0 \n\t"
174
        "punpcklbw 1(%1), %%mm0 \n\t"
175
        "add %3, %1 \n\t"
176
        "1: \n\t"
177
        "movd (%1), %%mm1 \n\t"
178
        "movd (%1,%3), %%mm3 \n\t"
179
        "punpcklbw 1(%1), %%mm1 \n\t"
180
        "punpcklbw 1(%1,%3), %%mm3 \n\t"
181
        "lea (%1,%3,2), %1 \n\t"
182
        "movq %%mm1, %%mm2 \n\t"
183
        "movq %%mm3, %%mm4 \n\t"
184
        "pmaddubsw %%mm7, %%mm0 \n\t"
185
        "pmaddubsw %%mm6, %%mm1 \n\t"
186
        "pmaddubsw %%mm7, %%mm2 \n\t"
187
        "pmaddubsw %%mm6, %%mm3 \n\t"
188
        "paddw %%mm5, %%mm0 \n\t"
189
        "paddw %%mm5, %%mm2 \n\t"
190
        "paddw %%mm0, %%mm1 \n\t"
191
        "paddw %%mm2, %%mm3 \n\t"
192
        "movq %%mm4, %%mm0 \n\t"
193
        "psrlw $6, %%mm1 \n\t"
194
        "psrlw $6, %%mm3 \n\t"
195
        "packuswb %%mm1, %%mm1 \n\t"
196
        "packuswb %%mm3, %%mm3 \n\t"
197
 AVG_OP("pavgb (%0), %%mm1 \n\t")
198
 AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
199
        "movd %%mm1, (%0)\n\t"
200
        "movd %%mm3, (%0,%3)\n\t"
201
        "sub $2, %2 \n\t"
202
        "lea (%0,%3,2), %0 \n\t"
203
        "jg 1b \n\t"
204
        :"+r"(dst), "+r"(src), "+r"(h)
205
        :"r"((x86_reg)stride)
206
    );
207
}
208

  
libavcodec/i386/dsputil_mmx.c
1
/*
2
 * MMX optimized DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 *
22
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23
 */
24

  
25
#include "libavutil/x86_cpu.h"
26
#include "libavcodec/dsputil.h"
27
#include "libavcodec/h263.h"
28
#include "libavcodec/mpegvideo.h"
29
#include "libavcodec/simple_idct.h"
30
#include "dsputil_mmx.h"
31
#include "mmx.h"
32
#include "vp3dsp_mmx.h"
33
#include "vp3dsp_sse2.h"
34
#include "idct_xvid.h"
35

  
36
//#undef NDEBUG
37
//#include <assert.h>
38

  
39
int mm_flags; /* multimedia extension flags */
40

  
41
/* pixel operations */
42
DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
43
DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
44

  
45
DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
46
{0x8000000080000000ULL, 0x8000000080000000ULL};
47

  
48
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
49
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
... This diff was truncated because it exceeds the maximum size that can be displayed.
