Revision be449fca libavcodec/i386/dsputil_mmx.c
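
The change throughout this revision appears to be purely mechanical: every GNU C inline-assembly statement written with the plain asm keyword is respelled as __asm__. Both spellings are accepted by GCC and produce identical code, but only the __asm__ form remains available in strict ISO C modes (for example -std=c99), where asm is not a reserved word, so the rename is presumably about build-flag portability rather than any behavioral change. A minimal sketch of the two spellings follows; the wrapper functions are illustrative only and are not part of dsputil_mmx.c:

    /* Hypothetical illustration of the spelling change applied throughout this
       diff. The emitted instruction is identical; only the keyword differs. */
    static inline void clear_mm7_old(void)
    {
        asm volatile ("pxor %%mm7, %%mm7" ::);      /* GNU extension spelling */
    }

    static inline void clear_mm7_new(void)
    {
        __asm__ volatile ("pxor %%mm7, %%mm7" ::);  /* always-available spelling */
    }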

View differences:

libavcodec/i386/dsputil_mmx.c

@@ line 70 @@
 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };

-#define JUMPALIGN() asm volatile (ASMALIGN(3)::)
-#define MOVQ_ZERO(regd)  asm volatile ("pxor %%" #regd ", %%" #regd ::)
+#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)
+#define MOVQ_ZERO(regd)  __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)

 #define MOVQ_BFE(regd) \
-    asm volatile ( \
+    __asm__ volatile ( \
     "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
     "paddb %%" #regd ", %%" #regd " \n\t" ::)

 #ifndef PIC
-#define MOVQ_BONE(regd)  asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
-#define MOVQ_WTWO(regd)  asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
+#define MOVQ_BONE(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
+#define MOVQ_WTWO(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
 #else
 // for shared library it's better to use this way for accessing constants
 // pcmpeqd -> -1
 #define MOVQ_BONE(regd) \
-    asm volatile ( \
+    __asm__ volatile ( \
     "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
     "psrlw $15, %%" #regd " \n\t" \
     "packuswb %%" #regd ", %%" #regd " \n\t" ::)

 #define MOVQ_WTWO(regd) \
-    asm volatile ( \
+    __asm__ volatile ( \
     "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
     "psrlw $15, %%" #regd " \n\t" \
     "psllw $1, %%" #regd " \n\t"::)

@@ line 223 @@
     p = block;
     pix = pixels;
     /* unrolled loop */
-        asm volatile(
+        __asm__ volatile(
                 "movq   %3, %%mm0               \n\t"
                 "movq   8%3, %%mm1              \n\t"
                 "movq   16%3, %%mm2             \n\t"

@@ line 248 @@
     // if here would be an exact copy of the code above
     // compiler would generate some very strange code
     // thus using "r"
-    asm volatile(
+    __asm__ volatile(
             "movq       (%3), %%mm0             \n\t"
             "movq       8(%3), %%mm1            \n\t"
             "movq       16(%3), %%mm2           \n\t"

@@ line 299 @@
     MOVQ_ZERO(mm7);
     i = 4;
     do {
-        asm volatile(
+        __asm__ volatile(
                 "movq   (%2), %%mm0     \n\t"
                 "movq   8(%2), %%mm1    \n\t"
                 "movq   16(%2), %%mm2   \n\t"
@@ line 330 @@

 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
-    asm volatile(
+    __asm__ volatile(
          "lea (%3, %3), %%"REG_a"       \n\t"
          ASMALIGN(3)
          "1:                            \n\t"

@@ line 356 @@

 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
-    asm volatile(
+    __asm__ volatile(
          "lea (%3, %3), %%"REG_a"       \n\t"
          ASMALIGN(3)
          "1:                            \n\t"

@@ line 382 @@

 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
-    asm volatile(
+    __asm__ volatile(
          "lea (%3, %3), %%"REG_a"       \n\t"
          ASMALIGN(3)
          "1:                            \n\t"

@@ line 416 @@

 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
-    asm volatile(
+    __asm__ volatile(
          "1:                            \n\t"
          "movdqu (%1), %%xmm0           \n\t"
          "movdqu (%1,%3), %%xmm1        \n\t"

@@ line 438 @@

 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
-    asm volatile(
+    __asm__ volatile(
          "1:                            \n\t"
          "movdqu (%1), %%xmm0           \n\t"
          "movdqu (%1,%3), %%xmm1        \n\t"

@@ line 464 @@

 static void clear_blocks_mmx(DCTELEM *blocks)
 {
-    asm volatile(
+    __asm__ volatile(
                 "pxor %%mm7, %%mm7              \n\t"
                 "mov $-128*6, %%"REG_a"         \n\t"
                 "1:                             \n\t"

@@ line 481 @@

 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
     x86_reg i=0;
-    asm volatile(
+    __asm__ volatile(
         "jmp 2f                         \n\t"
         "1:                             \n\t"
         "movq  (%1, %0), %%mm0          \n\t"

@@ line 505 @@

 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
     x86_reg i=0;
-    asm volatile(
+    __asm__ volatile(
         "jmp 2f                         \n\t"
         "1:                             \n\t"
         "movq   (%2, %0), %%mm0         \n\t"

@@ line 600 @@
     if(ENABLE_ANY_H263) {
     const int strength= ff_h263_loop_filter_strength[qscale];

-    asm volatile(
+    __asm__ volatile(

         H263_LOOP_FILTER

@@ line 618 @@
 }

 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
-    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
+    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
         "movd  %4, %%mm0                \n\t"
         "movd  %5, %%mm1                \n\t"
         "movd  %6, %%mm2                \n\t"

@@ line 656 @@

     transpose4x4(btemp  , src           , 8, stride);
     transpose4x4(btemp+4, src + 4*stride, 8, stride);
-    asm volatile(
+    __asm__ volatile(
         H263_LOOP_FILTER // 5 3 4 6

         : "+m" (temp[0]),
@@ line 666 @@
         : "g" (2*strength), "m"(ff_pb_FC)
     );

-    asm volatile(
+    __asm__ volatile(
         "movq %%mm5, %%mm1              \n\t"
         "movq %%mm4, %%mm0              \n\t"
         "punpcklbw %%mm3, %%mm5         \n\t"

@@ line 711 @@
     ptr = buf;
     if(w==8)
     {
-        asm volatile(
+        __asm__ volatile(
                 "1:                             \n\t"
                 "movd (%0), %%mm0               \n\t"
                 "punpcklbw %%mm0, %%mm0         \n\t"

@@ line 732 @@
     }
     else
     {
-        asm volatile(
+        __asm__ volatile(
                 "1:                             \n\t"
                 "movd (%0), %%mm0               \n\t"
                 "punpcklbw %%mm0, %%mm0         \n\t"

@@ line 757 @@
     for(i=0;i<w;i+=4) {
         /* top and bottom (and hopefully also the corners) */
         ptr= buf - (i + 1) * wrap - w;
-        asm volatile(
+        __asm__ volatile(
                 "1:                             \n\t"
                 "movq (%1, %0), %%mm0           \n\t"
                 "movq %%mm0, (%0)               \n\t"

@@ line 771 @@
                 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
         );
         ptr= last_line + (i + 1) * wrap - w;
-        asm volatile(
+        __asm__ volatile(
                 "1:                             \n\t"
                 "movq (%1, %0), %%mm0           \n\t"
                 "movq %%mm0, (%0)               \n\t"

@@ line 792 @@
 {\
     x86_reg i = -bpp;\
     x86_reg end = w-3;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor      %%mm7, %%mm7 \n"\
         "movd    (%1,%0), %%mm0 \n"\
         "movd    (%2,%0), %%mm1 \n"\

@@ line 886 @@
 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
     uint64_t temp;\
 \
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7                \n\t"\
         "1:                               \n\t"\
         "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\

@@ line 1025 @@
         temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
         temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
         temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
-        asm volatile(\
+        __asm__ volatile(\
             "movq (%0), %%mm0               \n\t"\
             "movq 8(%0), %%mm1              \n\t"\
             "paddw %2, %%mm0                \n\t"\

@@ line 1051 @@
 }\
 \
 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7                \n\t"\
         "1:                               \n\t"\
         "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\

@@ line 1128 @@
         temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
         temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
         temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
-        asm volatile(\
+        __asm__ volatile(\
             "movq (%0), %%mm0           \n\t"\
             "movq 8(%0), %%mm1          \n\t"\
             "paddw %2, %%mm0            \n\t"\

@@ line 1153 @@
     int count= 17;\
 \
     /*FIXME unroll */\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7              \n\t"\
         "1:                             \n\t"\
         "movq (%0), %%mm0               \n\t"\

@@ line 1181 @@
     count=4;\
     \
 /*FIXME reorder for speed */\
-    asm volatile(\
+    __asm__ volatile(\
         /*"pxor %%mm7, %%mm7              \n\t"*/\
         "1:                             \n\t"\
         "movq (%0), %%mm0               \n\t"\

@@ line 1231 @@
     int count= 9;\
 \
     /*FIXME unroll */\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7              \n\t"\
         "1:                             \n\t"\
         "movq (%0), %%mm0               \n\t"\

@@ line 1253 @@
     count=2;\
     \
 /*FIXME reorder for speed */\
-    asm volatile(\
+    __asm__ volatile(\
         /*"pxor %%mm7, %%mm7              \n\t"*/\
         "1:                             \n\t"\
         "movq (%0), %%mm0               \n\t"\

@@ line 1620 @@
         src = edge_buf;
     }

-    asm volatile(
+    __asm__ volatile(
         "movd         %0, %%mm6 \n\t"
         "pxor      %%mm7, %%mm7 \n\t"
         "punpcklwd %%mm6, %%mm6 \n\t"

@@ line 1639 @@
                             oys - dyys + dyxs*(x+3) };

         for(y=0; y<h; y++){
-            asm volatile(
+            __asm__ volatile(
                 "movq   %0,  %%mm4 \n\t"
                 "movq   %1,  %%mm5 \n\t"
                 "paddw  %2,  %%mm4 \n\t"

@@ line 1652 @@
                 : "m"(*dxy4), "m"(*dyy4)
             );

-            asm volatile(
+            __asm__ volatile(
                 "movq   %%mm6, %%mm2 \n\t"
                 "movq   %%mm6, %%mm1 \n\t"
                 "psubw  %%mm4, %%mm2 \n\t"

@@ line 1701 @@
 static void name(void *mem, int stride, int h){\
     const uint8_t *p= mem;\
     do{\
-        asm volatile(#op" %0" :: "m"(*p));\
+        __asm__ volatile(#op" %0" :: "m"(*p));\
         p+= stride;\
     }while(--h);\
 }
@@ line 1787 @@
 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
 {
     int i;
-    asm volatile("pxor %%mm7, %%mm7":);
+    __asm__ volatile("pxor %%mm7, %%mm7":);
     for(i=0; i<blocksize; i+=2) {
-        asm volatile(
+        __asm__ volatile(
             "movq    %0,    %%mm0 \n\t"
             "movq    %1,    %%mm1 \n\t"
             "movq    %%mm0, %%mm2 \n\t"

@@ line 1809 @@
             ::"memory"
         );
     }
-    asm volatile("femms");
+    __asm__ volatile("femms");
 }
 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
 {
     int i;

-    asm volatile(
+    __asm__ volatile(
             "movaps  %0,     %%xmm5 \n\t"
         ::"m"(ff_pdw_80000000[0])
     );
     for(i=0; i<blocksize; i+=4) {
-        asm volatile(
+        __asm__ volatile(
             "movaps  %0,     %%xmm0 \n\t"
             "movaps  %1,     %%xmm1 \n\t"
             "xorps   %%xmm2, %%xmm2 \n\t"

@@ line 1846 @@
 #define IF0(x)

 #define MIX5(mono,stereo)\
-    asm volatile(\
+    __asm__ volatile(\
         "movss          0(%2), %%xmm5 \n"\
         "movss          8(%2), %%xmm6 \n"\
         "movss         24(%2), %%xmm7 \n"\

@@ line 1879 @@
     );

 #define MIX_MISC(stereo)\
-    asm volatile(\
+    __asm__ volatile(\
         "1: \n"\
         "movaps  (%3,%0), %%xmm0 \n"\
  stereo("movaps   %%xmm0, %%xmm1 \n")\

@@ line 1919 @@
     } else {
         DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]);
         j = 2*in_ch*sizeof(float);
-        asm volatile(
+        __asm__ volatile(
             "1: \n"
             "sub $8, %0 \n"
             "movss     (%2,%0), %%xmm6 \n"

@@ line 1943 @@

 static void vector_fmul_3dnow(float *dst, const float *src, int len){
     x86_reg i = (len-4)*4;
-    asm volatile(
+    __asm__ volatile(
         "1: \n\t"
         "movq    (%1,%0), %%mm0 \n\t"
         "movq   8(%1,%0), %%mm1 \n\t"

@@ line 1961 @@
 }
 static void vector_fmul_sse(float *dst, const float *src, int len){
     x86_reg i = (len-8)*4;
-    asm volatile(
+    __asm__ volatile(
         "1: \n\t"
         "movaps    (%1,%0), %%xmm0 \n\t"
         "movaps  16(%1,%0), %%xmm1 \n\t"

@@ line 1979 @@

 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
     x86_reg i = len*4-16;
-    asm volatile(
+    __asm__ volatile(
         "1: \n\t"
         "pswapd   8(%1), %%mm0 \n\t"
         "pswapd    (%1), %%mm1 \n\t"

@@ line 1993 @@
         :"+r"(i), "+r"(src1)
         :"r"(dst), "r"(src0)
     );
-    asm volatile("femms");
+    __asm__ volatile("femms");
 }
 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
     x86_reg i = len*4-32;
-    asm volatile(
+    __asm__ volatile(
         "1: \n\t"
         "movaps        16(%1), %%xmm0 \n\t"
         "movaps          (%1), %%xmm1 \n\t"

@@ line 2020 @@
     x86_reg i = (len-4)*4;
     if(step == 2 && src3 == 0){
         dst += (len-4)*2;
-        asm volatile(
+        __asm__ volatile(
             "1: \n\t"
             "movq   (%2,%0),  %%mm0 \n\t"
             "movq  8(%2,%0),  %%mm1 \n\t"

@@ line 2043 @@
         );
     }
     else if(step == 1 && src3 == 0){
-        asm volatile(
+        __asm__ volatile(
             "1: \n\t"
             "movq    (%2,%0), %%mm0 \n\t"
             "movq   8(%2,%0), %%mm1 \n\t"

@@ line 2062 @@
     }
     else
         ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
-    asm volatile("femms");
+    __asm__ volatile("femms");
 }
 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                     const float *src2, int src3, int len, int step){
     x86_reg i = (len-8)*4;
     if(step == 2 && src3 == 0){
         dst += (len-8)*2;
-        asm volatile(
+        __asm__ volatile(
             "1: \n\t"
             "movaps   (%2,%0), %%xmm0 \n\t"
             "movaps 16(%2,%0), %%xmm1 \n\t"
@@ line 2100 @@
         );
     }
     else if(step == 1 && src3 == 0){
-        asm volatile(
+        __asm__ volatile(
             "1: \n\t"
             "movaps   (%2,%0), %%xmm0 \n\t"
             "movaps 16(%2,%0), %%xmm1 \n\t"

@@ line 2127 @@
     if(add_bias == 0){
         x86_reg i = -len*4;
         x86_reg j = len*4-8;
-        asm volatile(
+        __asm__ volatile(
             "1: \n"
             "pswapd  (%5,%1), %%mm1 \n"
             "movq    (%5,%0), %%mm0 \n"

@@ line 2162 @@
     if(add_bias == 0){
         x86_reg i = -len*4;
         x86_reg j = len*4-16;
-        asm volatile(
+        __asm__ volatile(
             "1: \n"
             "movaps       (%5,%1), %%xmm1 \n"
             "movaps       (%5,%0), %%xmm0 \n"

@@ line 2195 @@
 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
 {
     x86_reg i = -4*len;
-    asm volatile(
+    __asm__ volatile(
         "movss  %3, %%xmm4 \n"
         "shufps $0, %%xmm4, %%xmm4 \n"
         "1: \n"

@@ line 2219 @@
 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
 {
     x86_reg i = -4*len;
-    asm volatile(
+    __asm__ volatile(
         "movss  %3, %%xmm4 \n"
         "shufps $0, %%xmm4, %%xmm4 \n"
         "1: \n"

@@ line 2238 @@

 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
     // not bit-exact: pf2id uses different rounding than C and SSE
-    asm volatile(
+    __asm__ volatile(
         "add        %0          , %0        \n\t"
         "lea         (%2,%0,2)  , %2        \n\t"
         "add        %0          , %1        \n\t"

@@ line 2259 @@
     );
 }
 static void float_to_int16_sse(int16_t *dst, const float *src, long len){
-    asm volatile(
+    __asm__ volatile(
         "add        %0          , %0        \n\t"
         "lea         (%2,%0,2)  , %2        \n\t"
         "add        %0          , %1        \n\t"

@@ line 2281 @@
 }

 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
-    asm volatile(
+    __asm__ volatile(
         "add        %0          , %0        \n\t"
         "lea         (%2,%0,2)  , %2        \n\t"
         "add        %0          , %1        \n\t"

@@ line 2326 @@
     else if(channels==2){\
         const float *src0 = src[0];\
         const float *src1 = src[1];\
-        asm volatile(\
+        __asm__ volatile(\
             "shl $2, %0 \n"\
             "add %0, %1 \n"\
             "add %0, %2 \n"\

@@ line 2412 @@
     x86_reg o = -(order << 1);
     v1 += order;
     v2 += order;
-    asm volatile(
+    __asm__ volatile(
         "1:                          \n\t"
         "movdqu   (%1,%2),   %%xmm0  \n\t"
         "movdqu 16(%1,%2),   %%xmm1  \n\t"

@@ line 2431 @@
     x86_reg o = -(order << 1);
     v1 += order;
     v2 += order;
-    asm volatile(
+    __asm__ volatile(
         "1:                           \n\t"
         "movdqa    (%0,%2),   %%xmm0  \n\t"
         "movdqa  16(%0,%2),   %%xmm2  \n\t"

@@ line 2456 @@
     v1 += order;
     v2 += order;
     sh = shift;
-    asm volatile(
+    __asm__ volatile(
         "pxor      %%xmm7,  %%xmm7        \n\t"
         "1:                               \n\t"
         "movdqu    (%0,%3), %%xmm0        \n\t"
