Revision be449fca libavcodec/i386/h264dsp_mmx.c
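
This revision is a mechanical rename: every inline-assembly statement spelled "asm" becomes "__asm__", with the surrounding code left untouched. The diff itself gives no rationale; a plausible one, stated here as an assumption, is strict-ISO-C compatibility, since GCC does not recognize the plain "asm" keyword under -std=c99 or -ansi, while the alternate keyword "__asm__" is accepted in every mode. A minimal sketch of the distinction, using a hypothetical add32() helper that is not part of this file:

    #include <stdint.h>

    /* Builds with gcc -std=c99 -pedantic; the same statement written with
       the plain "asm" keyword would be rejected in that mode. */
    static inline int32_t add32(int32_t a, int32_t b)
    {
    #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
        __asm__ ("addl %1, %0"   /* AT&T syntax: a += b */
                 : "+r"(a)       /* read/write operand */
                 : "r"(b));
        return a;
    #else
        return a + b;            /* portable fallback */
    #endif
    }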


--- a/libavcodec/i386/h264dsp_mmx.c
+++ b/libavcodec/i386/h264dsp_mmx.c
@@ -57,14 +57,14 @@
 static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
 {
     /* Load dct coeffs */
-    asm volatile(
+    __asm__ volatile(
         "movq   (%0), %%mm0 \n\t"
         "movq  8(%0), %%mm1 \n\t"
         "movq 16(%0), %%mm2 \n\t"
         "movq 24(%0), %%mm3 \n\t"
     :: "r"(block) );
 
-    asm volatile(
+    __asm__ volatile(
         /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
         IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
 
@@ -80,7 +80,7 @@
         "pxor %%mm7, %%mm7    \n\t"
     :: "m"(ff_pw_32));
 
-    asm volatile(
+    __asm__ volatile(
     STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
         "add %1, %0             \n\t"
     STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
@@ -95,7 +95,7 @@
 
 static inline void h264_idct8_1d(int16_t *block)
 {
-    asm volatile(
+    __asm__ volatile(
         "movq 112(%0), %%mm7  \n\t"
         "movq  80(%0), %%mm0  \n\t"
         "movq  48(%0), %%mm3  \n\t"
@@ -166,7 +166,7 @@
 
         h264_idct8_1d(block+4*i);
 
-        asm volatile(
+        __asm__ volatile(
            "movq   %%mm7,    %0   \n\t"
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
            "movq   %%mm0,  8(%1)  \n\t"
@@ -188,7 +188,7 @@
     for(i=0; i<2; i++){
         h264_idct8_1d(b2+4*i);
 
-        asm volatile(
+        __asm__ volatile(
            "psraw     $6, %%mm7  \n\t"
            "psraw     $6, %%mm6  \n\t"
            "psraw     $6, %%mm5  \n\t"
@@ -269,7 +269,7 @@
 
 static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
 {
-    asm volatile(
+    __asm__ volatile(
         "movdqa   0x10(%1), %%xmm1 \n"
         "movdqa   0x20(%1), %%xmm2 \n"
         "movdqa   0x30(%1), %%xmm3 \n"
@@ -304,7 +304,7 @@
 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
 {
     int dc = (block[0] + 32) >> 6;
-    asm volatile(
+    __asm__ volatile(
         "movd          %0, %%mm0 \n\t"
         "pshufw $0, %%mm0, %%mm0 \n\t"
         "pxor       %%mm1, %%mm1 \n\t"
@@ -313,7 +313,7 @@
         "packuswb   %%mm1, %%mm1 \n\t"
         ::"r"(dc)
     );
-    asm volatile(
+    __asm__ volatile(
         "movd          %0, %%mm2 \n\t"
         "movd          %1, %%mm3 \n\t"
         "movd          %2, %%mm4 \n\t"
@@ -341,7 +341,7 @@
 {
     int dc = (block[0] + 32) >> 6;
     int y;
-    asm volatile(
+    __asm__ volatile(
         "movd          %0, %%mm0 \n\t"
         "pshufw $0, %%mm0, %%mm0 \n\t"
         "pxor       %%mm1, %%mm1 \n\t"
@@ -351,7 +351,7 @@
         ::"r"(dc)
     );
     for(y=2; y--; dst += 4*stride){
-    asm volatile(
+    __asm__ volatile(
         "movq          %0, %%mm2 \n\t"
         "movq          %1, %%mm3 \n\t"
         "movq          %2, %%mm4 \n\t"
@@ -463,7 +463,7 @@
 {
     DECLARE_ALIGNED_8(uint64_t, tmp0[2]);
 
-    asm volatile(
+    __asm__ volatile(
         "movq    (%1,%3), %%mm0    \n\t" //p1
         "movq    (%1,%3,2), %%mm1  \n\t" //p0
         "movq    (%2),    %%mm2    \n\t" //q0
@@ -540,7 +540,7 @@
 
 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
 {
-    asm volatile(
+    __asm__ volatile(
         "movq    (%0),    %%mm0     \n\t" //p1
         "movq    (%0,%2), %%mm1     \n\t" //p0
         "movq    (%1),    %%mm2     \n\t" //q0
@@ -586,7 +586,7 @@
 
 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
 {
-    asm volatile(
+    __asm__ volatile(
         "movq    (%0),    %%mm0     \n\t"
         "movq    (%0,%2), %%mm1     \n\t"
         "movq    (%1),    %%mm2     \n\t"
@@ -628,7 +628,7 @@
 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                             int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
     int dir;
-    asm volatile(
+    __asm__ volatile(
         "pxor %%mm7, %%mm7 \n\t"
         "movq %0, %%mm6 \n\t"
         "movq %1, %%mm5 \n\t"
@@ -636,7 +636,7 @@
         ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
     );
     if(field)
-        asm volatile(
+        __asm__ volatile(
             "movq %0, %%mm5 \n\t"
             "movq %1, %%mm4 \n\t"
             ::"m"(ff_pb_3_1), "m"(ff_pb_7_3)
@@ -650,14 +650,14 @@
         DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
         int b_idx, edge, l;
         for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
-            asm volatile(
+            __asm__ volatile(
                 "pand %0, %%mm0 \n\t"
                 ::"m"(mask_dir)
             );
             if(!(mask_mv & edge)) {
-                asm volatile("pxor %%mm0, %%mm0 \n\t":);
+                __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
                 for( l = bidir; l >= 0; l-- ) {
-                    asm volatile(
+                    __asm__ volatile(
                         "movd %0, %%mm1 \n\t"
                         "punpckldq %1, %%mm1 \n\t"
                         "movq %%mm1, %%mm2 \n\t"
@@ -688,7 +688,7 @@
                     );
                 }
             }
-            asm volatile(
+            __asm__ volatile(
                 "movd %0, %%mm1 \n\t"
                 "por  %1, %%mm1 \n\t"
                 "punpcklbw %%mm7, %%mm1 \n\t"
@@ -696,7 +696,7 @@
                 ::"m"(nnz[b_idx]),
                   "m"(nnz[b_idx+d_idx])
             );
-            asm volatile(
+            __asm__ volatile(
                 "pcmpeqw %%mm7, %%mm0 \n\t"
                 "pcmpeqw %%mm7, %%mm0 \n\t"
                 "psrlw $15, %%mm0 \n\t" // nonzero -> 1
@@ -713,7 +713,7 @@
         edges = 4;
         step = 1;
     }
-    asm volatile(
+    __asm__ volatile(
         "movq   (%0), %%mm0 \n\t"
         "movq  8(%0), %%mm1 \n\t"
         "movq 16(%0), %%mm2 \n\t"
@@ -774,7 +774,7 @@
 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
     int h=4;\
 \
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7          \n\t"\
         "movq %5, %%mm4             \n\t"\
         "movq %6, %%mm5             \n\t"\
@@ -813,14 +813,14 @@
 }\
 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
     int h=4;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7          \n\t"\
         "movq %0, %%mm4             \n\t"\
         "movq %1, %%mm5             \n\t"\
         :: "m"(ff_pw_5), "m"(ff_pw_16)\
     );\
     do{\
-    asm volatile(\
+    __asm__ volatile(\
         "movd  -1(%0), %%mm1        \n\t"\
         "movd    (%0), %%mm2        \n\t"\
         "movd   1(%0), %%mm3        \n\t"\
@@ -857,7 +857,7 @@
 }\
 static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
     src -= 2*srcStride;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7          \n\t"\
         "movd (%0), %%mm0           \n\t"\
         "add %2, %0                 \n\t"\
@@ -889,7 +889,7 @@
     int w=3;\
     src -= 2*srcStride+2;\
     while(w--){\
-        asm volatile(\
+        __asm__ volatile(\
            "pxor %%mm7, %%mm7      \n\t"\
            "movd (%0), %%mm0       \n\t"\
            "add %2, %0             \n\t"\
@@ -919,7 +919,7 @@
         src += 4 - 9*srcStride;\
     }\
     tmp -= 3*4;\
-    asm volatile(\
+    __asm__ volatile(\
         "1:                         \n\t"\
         "movq     (%0), %%mm0       \n\t"\
         "paddw  10(%0), %%mm0       \n\t"\
@@ -948,7 +948,7 @@
 \
 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
     int h=8;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7          \n\t"\
         "movq %5, %%mm6             \n\t"\
         "1:                         \n\t"\
@@ -1005,13 +1005,13 @@
 \
 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
     int h=8;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7          \n\t"\
         "movq %0, %%mm6             \n\t"\
         :: "m"(ff_pw_5)\
     );\
     do{\
-    asm volatile(\
+    __asm__ volatile(\
         "movq    (%0), %%mm0        \n\t"\
         "movq   1(%0), %%mm2        \n\t"\
         "movq %%mm0, %%mm1          \n\t"\
@@ -1071,7 +1071,7 @@
     src -= 2*srcStride;\
     \
     while(w--){\
-      asm volatile(\
+      __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movd (%0), %%mm0           \n\t"\
        "add %2, %0                 \n\t"\
@@ -1102,7 +1102,7 @@
        : "memory"\
     );\
     if(h==16){\
-        asm volatile(\
+        __asm__ volatile(\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
@@ -1125,7 +1125,7 @@
     int w = (size+8)>>2;\
     src -= 2*srcStride+2;\
     while(w--){\
-        asm volatile(\
+        __asm__ volatile(\
            "pxor %%mm7, %%mm7      \n\t"\
            "movd (%0), %%mm0       \n\t"\
            "add %2, %0             \n\t"\
@@ -1155,7 +1155,7 @@
            : "memory"\
        );\
        if(size==16){\
-            asm volatile(\
+            __asm__ volatile(\
                QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1,  8*48)\
                QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2,  9*48)\
                QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
@@ -1177,7 +1177,7 @@
     int w = size>>4;\
     do{\
     int h = size;\
-    asm volatile(\
+    __asm__ volatile(\
         "1:                         \n\t"\
         "movq     (%0), %%mm0       \n\t"\
         "movq    8(%0), %%mm3       \n\t"\
@@ -1261,7 +1261,7 @@
 \
 static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
 {\
-    asm volatile(\
+    __asm__ volatile(\
         "movq      (%1), %%mm0          \n\t"\
         "movq    24(%1), %%mm1          \n\t"\
         "psraw      $5,  %%mm0          \n\t"\
@@ -1291,7 +1291,7 @@
 static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
 {\
     do{\
-    asm volatile(\
+    __asm__ volatile(\
         "movq      (%1), %%mm0          \n\t"\
         "movq     8(%1), %%mm1          \n\t"\
         "movq    48(%1), %%mm2          \n\t"\
@@ -1325,7 +1325,7 @@
 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
     int h=16;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%xmm15, %%xmm15      \n\t"\
         "movdqa %6, %%xmm14         \n\t"\
         "movdqa %7, %%xmm13         \n\t"\
@@ -1403,13 +1403,13 @@
 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
     int h=8;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%xmm7, %%xmm7        \n\t"\
         "movdqa %0, %%xmm6          \n\t"\
         :: "m"(ff_pw_5)\
     );\
     do{\
-    asm volatile(\
+    __asm__ volatile(\
         "lddqu   -5(%0), %%xmm1     \n\t"\
         "movdqa  %%xmm1, %%xmm0     \n\t"\
         "punpckhbw %%xmm7, %%xmm1   \n\t"\
@@ -1450,7 +1450,7 @@
 \
 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
     int h=8;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%xmm7, %%xmm7        \n\t"\
         "movdqa %5, %%xmm6          \n\t"\
         "1:                         \n\t"\
@@ -1501,7 +1501,7 @@
 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
     src -= 2*srcStride;\
     \
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%xmm7, %%xmm7        \n\t"\
         "movq (%0), %%xmm0          \n\t"\
         "add %2, %0                 \n\t"\
@@ -1532,7 +1532,7 @@
         : "memory"\
     );\
     if(h==16){\
-        asm volatile(\
+        __asm__ volatile(\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
@@ -1560,7 +1560,7 @@
     int w = (size+8)>>3;
     src -= 2*srcStride+2;
     while(w--){
-        asm volatile(
+        __asm__ volatile(
            "pxor %%xmm7, %%xmm7        \n\t"
            "movq (%0), %%xmm0          \n\t"
            "add %2, %0                 \n\t"
@@ -1590,7 +1590,7 @@
            : "memory"
        );
        if(size==16){
-            asm volatile(
+            __asm__ volatile(
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
@@ -1613,7 +1613,7 @@
 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
     int h = size;\
     if(size == 16){\
-        asm volatile(\
+        __asm__ volatile(\
            "1:                         \n\t"\
            "movdqa 32(%0), %%xmm4      \n\t"\
            "movdqa 16(%0), %%xmm5      \n\t"\
@@ -1668,7 +1668,7 @@
            : "memory"\
        );\
    }else{\
-        asm volatile(\
+        __asm__ volatile(\
            "1:                         \n\t"\
            "movdqa 16(%0), %%xmm1      \n\t"\
            "movdqa   (%0), %%xmm0      \n\t"\
@@ -2022,7 +2022,7 @@
     int x, y;
     offset <<= log2_denom;
     offset += (1 << log2_denom) >> 1;
-    asm volatile(
+    __asm__ volatile(
         "movd    %0, %%mm4        \n\t"
         "movd    %1, %%mm5        \n\t"
         "movd    %2, %%mm6        \n\t"
@@ -2033,7 +2033,7 @@
     );
     for(y=0; y<h; y+=2){
         for(x=0; x<w; x+=4){
-            asm volatile(
+            __asm__ volatile(
                 "movd      %0,    %%mm0 \n\t"
                 "movd      %1,    %%mm1 \n\t"
                 "punpcklbw %%mm7, %%mm0 \n\t"
@@ -2060,7 +2060,7 @@
 {
     int x, y;
     offset = ((offset + 1) | 1) << log2_denom;
-    asm volatile(
+    __asm__ volatile(
         "movd    %0, %%mm3        \n\t"
         "movd    %1, %%mm4        \n\t"
         "movd    %2, %%mm5        \n\t"
@@ -2073,7 +2073,7 @@
     );
     for(y=0; y<h; y++){
         for(x=0; x<w; x+=4){
-            asm volatile(
+            __asm__ volatile(
                 "movd      %0,    %%mm0 \n\t"
                 "movd      %1,    %%mm1 \n\t"
                 "punpcklbw %%mm7, %%mm0 \n\t"
