Revision ff506a90

View differences:

libavcodec/i386/mpegvideo_mmx.c
673 673
    );
674 674
}
675 675

  
676
/* Remember (as HAVE_SSSE3_BAK) whether HAVE_SSSE3 was defined, then undefine
 * it so that the template inclusions that follow are compiled without it.
 * HAVE_SSSE3 is defined again further down only if HAVE_SSSE3_BAK is set. */
#ifdef HAVE_SSSE3
677
#define HAVE_SSSE3_BAK
678
#endif
679
#undef HAVE_SSSE3
680

  
681
#undef HAVE_SSE2
676 682
#undef HAVE_MMX2
677 683
#define RENAME(a) a ## _MMX
678 684
#define RENAMEl(a) a ## _mmx
......
685 691
#define RENAMEl(a) a ## _mmx2
686 692
#include "mpegvideo_mmx_template.c"
687 693

  
694
#define HAVE_SSE2
688 695
#undef RENAME
689 696
#undef RENAMEl
690 697
#define RENAME(a) a ## _SSE2
691 698
#define RENAMEl(a) a ## _sse2
692 699
#include "mpegvideo_mmx_template.c"
693 700

  
701
/* Extra template instantiation with HAVE_SSSE3 restored, emitted only when
 * HAVE_SSSE3 was defined before it was undefined above. HAVE_SSE2 is still
 * defined at this point, so this pass also uses the template's HAVE_SSE2
 * register/instruction selection. */
#ifdef HAVE_SSSE3_BAK
702
#define HAVE_SSSE3
703
#undef RENAME
704
#undef RENAMEl
705
#define RENAME(a) a ## _SSSE3
706
/* NOTE(review): RENAMEl keeps the _sse2 suffix here while RENAME uses _SSSE3;
 * presumably the lowercase-named symbols have no SSSE3 variant -- confirm
 * against the template's uses of RENAMEl. */
#define RENAMEl(a) a ## _sse2
707
#include "mpegvideo_mmx_template.c"
708
#endif
709

  
694 710
void MPV_common_init_mmx(MpegEncContext *s)
695 711
{
696 712
    if (mm_flags & MM_MMX) {
......
713 729
        }
714 730

  
715 731
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
732
#ifdef HAVE_SSSE3
733
            if(mm_flags & MM_SSSE3){
734
                s->dct_quantize= dct_quantize_SSSE3;
735
            } else
736
#endif
716 737
            if(mm_flags & MM_SSE2){
717 738
                s->dct_quantize= dct_quantize_SSE2;
718 739
            } else if(mm_flags & MM_MMXEXT){
libavcodec/i386/mpegvideo_mmx_template.c
19 19
 * License along with FFmpeg; if not, write to the Free Software
20 20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 21
 */
22

  
23
#undef MMREG_WIDTH
24
#undef MM
25
#undef MOVQ
22 26
#undef SPREADW
23 27
#undef PMAXW
24 28
#undef PMAX
25
#ifdef HAVE_MMX2
26
#define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t"
27
#define PMAXW(a,b) "pmaxsw " #a ", " #b "     \n\t"
29
#undef SAVE_SIGN
30
#undef RESTORE_SIGN
31

  
32
#if defined(HAVE_SSE2)
33
#define MMREG_WIDTH "16"
34
#define MM "%%xmm"
35
#define MOVQ "movdqa"
36
#define SPREADW(a) \
37
            "pshuflw $0, "a", "a"       \n\t"\
38
            "punpcklwd "a", "a"         \n\t"
39
#define PMAXW(a,b) "pmaxsw "a", "b"     \n\t"
28 40
#define PMAX(a,b) \
29
            "pshufw $0x0E," #a ", " #b "        \n\t"\
41
            "movhlps "a", "b"           \n\t"\
30 42
            PMAXW(b, a)\
31
            "pshufw $0x01," #a ", " #b "        \n\t"\
43
            "pshuflw $0x0E, "a", "b"    \n\t"\
44
            PMAXW(b, a)\
45
            "pshuflw $0x01, "a", "b"    \n\t"\
46
            PMAXW(b, a)
47
#else
48
#define MMREG_WIDTH "8"
49
#define MM "%%mm"
50
#define MOVQ "movq"
51
#if defined(HAVE_MMX2)
52
#define SPREADW(a) "pshufw $0, "a", "a" \n\t"
53
#define PMAXW(a,b) "pmaxsw "a", "b"     \n\t"
54
#define PMAX(a,b) \
55
            "pshufw $0x0E, "a", "b"     \n\t"\
56
            PMAXW(b, a)\
57
            "pshufw $0x01, "a", "b"     \n\t"\
32 58
            PMAXW(b, a)
33 59
#else
34 60
#define SPREADW(a) \
35
        "punpcklwd " #a ", " #a " \n\t"\
36
        "punpcklwd " #a ", " #a " \n\t"
61
            "punpcklwd "a", "a"         \n\t"\
62
            "punpcklwd "a", "a"         \n\t"
37 63
#define PMAXW(a,b) \
38
        "psubusw " #a ", " #b " \n\t"\
39
        "paddw " #a ", " #b "   \n\t"
64
            "psubusw "a", "b"           \n\t"\
65
            "paddw "a", "b"             \n\t"
40 66
#define PMAX(a,b)  \
41
            "movq " #a ", " #b "                \n\t"\
42
            "psrlq $32, " #a "                  \n\t"\
67
            "movq "a", "b"              \n\t"\
68
            "psrlq $32, "a"             \n\t"\
43 69
            PMAXW(b, a)\
44
            "movq " #a ", " #b "                \n\t"\
45
            "psrlq $16, " #a "                  \n\t"\
70
            "movq "a", "b"              \n\t"\
71
            "psrlq $16, "a"             \n\t"\
46 72
            PMAXW(b, a)
47 73

  
48 74
#endif
75
#endif
76

  
77
/* SAVE_SIGN(a,b):    replace the words in b by their absolute value and keep
 *                    the sign information in a.
 * RESTORE_SIGN(a,b): re-apply the sign information held in a to the words in b.
 * With HAVE_SSSE3, a holds a copy of the original values (movdqa) and
 * pabsw/psignw do the work; otherwise a holds a per-word mask built with
 * pcmpgtw and the (un)negation is done with the pxor/psubw pair. */
#ifdef HAVE_SSSE3
78
#define SAVE_SIGN(a,b) \
79
            "movdqa "b", "a"            \n\t"\
80
            "pabsw  "b", "b"            \n\t"
81
#define RESTORE_SIGN(a,b) \
82
            "psignw "a", "b"            \n\t"
83
#else
84
#define SAVE_SIGN(a,b) \
85
            "pxor "a", "a"              \n\t"\
86
            "pcmpgtw "b", "a"           \n\t" /* block[i] < 0 ? 0xFFFF : 0x0000 (a was zeroed, so this tests 0 > b per word) */\
87
            "pxor "a", "b"              \n\t"\
88
            "psubw "a", "b"             \n\t" /* ABS(block[i]) */
89
#define RESTORE_SIGN(a,b) \
90
            "pxor "a", "b"              \n\t"\
91
            "psubw "a", "b"             \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
92
#endif
49 93

  
50 94
static int RENAME(dct_quantize)(MpegEncContext *s,
51 95
                            DCTELEM *block, int n,
......
54 98
    long last_non_zero_p1;
55 99
    int level=0, q; //=0 is because gcc says uninitialized ...
56 100
    const uint16_t *qmat, *bias;
57
    DECLARE_ALIGNED_8(int16_t, temp_block[64]);
101
    DECLARE_ALIGNED_16(int16_t, temp_block[64]);
58 102

  
59 103
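    assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? NOTE(review): this only checks 8-byte alignment, while temp_block is declared with DECLARE_ALIGNED_16 and the HAVE_SSE2 path stores to it with movdqa -- confirm whether the mask should be 15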
60 104

  
......
106 150
    if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
107 151

  
108 152
        asm volatile(
109
            "movd %%"REG_a", %%mm3              \n\t" // last_non_zero_p1
110
            SPREADW(%%mm3)
111
            "pxor %%mm7, %%mm7                  \n\t" // 0
112
            "pxor %%mm4, %%mm4                  \n\t" // 0
113
            "movq (%2), %%mm5                   \n\t" // qmat[0]
114
            "pxor %%mm6, %%mm6                  \n\t"
115
            "psubw (%3), %%mm6                  \n\t" // -bias[0]
153
            "movd %%"REG_a", "MM"3              \n\t" // last_non_zero_p1
154
            SPREADW(MM"3")
155
            "pxor "MM"7, "MM"7                  \n\t" // 0
156
            "pxor "MM"4, "MM"4                  \n\t" // 0
157
            MOVQ" (%2), "MM"5                   \n\t" // qmat[0]
158
            "pxor "MM"6, "MM"6                  \n\t"
159
            "psubw (%3), "MM"6                  \n\t" // -bias[0]
116 160
            "mov $-128, %%"REG_a"               \n\t"
117 161
            ASMALIGN(4)
118 162
            "1:                                 \n\t"
119
            "pxor %%mm1, %%mm1                  \n\t" // 0
120
            "movq (%1, %%"REG_a"), %%mm0        \n\t" // block[i]
121
            "pcmpgtw %%mm0, %%mm1               \n\t" // block[i] <= 0 ? 0xFF : 0x00
122
            "pxor %%mm1, %%mm0                  \n\t"
123
            "psubw %%mm1, %%mm0                 \n\t" // ABS(block[i])
124
            "psubusw %%mm6, %%mm0               \n\t" // ABS(block[i]) + bias[0]
125
            "pmulhw %%mm5, %%mm0                \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
126
            "por %%mm0, %%mm4                   \n\t"
127
            "pxor %%mm1, %%mm0                  \n\t"
128
            "psubw %%mm1, %%mm0                 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
129
            "movq %%mm0, (%5, %%"REG_a")        \n\t"
130
            "pcmpeqw %%mm7, %%mm0               \n\t" // out==0 ? 0xFF : 0x00
131
            "movq (%4, %%"REG_a"), %%mm1        \n\t"
132
            "movq %%mm7, (%1, %%"REG_a")        \n\t" // 0
133
            "pandn %%mm1, %%mm0                 \n\t"
134
            PMAXW(%%mm0, %%mm3)
135
            "add $8, %%"REG_a"                  \n\t"
163
            MOVQ" (%1, %%"REG_a"), "MM"0        \n\t" // block[i]
164
            SAVE_SIGN(MM"1", MM"0")                   // ABS(block[i])
165
            "psubusw "MM"6, "MM"0               \n\t" // ABS(block[i]) + bias[0]
166
            "pmulhw "MM"5, "MM"0                \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
167
            "por "MM"0, "MM"4                   \n\t"
168
            RESTORE_SIGN(MM"1", MM"0")                // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
169
            MOVQ" "MM"0, (%5, %%"REG_a")        \n\t"
170
            "pcmpeqw "MM"7, "MM"0               \n\t" // out==0 ? 0xFF : 0x00
171
            MOVQ" (%4, %%"REG_a"), "MM"1        \n\t"
172
            MOVQ" "MM"7, (%1, %%"REG_a")        \n\t" // 0
173
            "pandn "MM"1, "MM"0                 \n\t"
174
            PMAXW(MM"0", MM"3")
175
            "add $"MMREG_WIDTH", %%"REG_a"      \n\t"
136 176
            " js 1b                             \n\t"
137
            PMAX(%%mm3, %%mm0)
138
            "movd %%mm3, %%"REG_a"              \n\t"
177
            PMAX(MM"3", MM"0")
178
            "movd "MM"3, %%"REG_a"              \n\t"
139 179
            "movzb %%al, %%"REG_a"              \n\t" // last_non_zero_p1
140 180
            : "+a" (last_non_zero_p1)
141 181
            : "r" (block+64), "r" (qmat), "r" (bias),
142 182
              "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
143 183
        );
144
        // note the asm is split because gcc doesn't like that many operands ...
145
        asm volatile(
146
            "movd %1, %%mm1                     \n\t" // max_qcoeff
147
            SPREADW(%%mm1)
148
            "psubusw %%mm1, %%mm4               \n\t"
149
            "packuswb %%mm4, %%mm4              \n\t"
150
            "movd %%mm4, %0                     \n\t" // *overflow
151
        : "=g" (*overflow)
152
        : "g" (s->max_qcoeff)
153
        );
154 184
    }else{ // FMT_H263
155 185
        asm volatile(
156
            "movd %%"REG_a", %%mm3              \n\t" // last_non_zero_p1
157
            SPREADW(%%mm3)
158
            "pxor %%mm7, %%mm7                  \n\t" // 0
159
            "pxor %%mm4, %%mm4                  \n\t" // 0
186
            "movd %%"REG_a", "MM"3              \n\t" // last_non_zero_p1
187
            SPREADW(MM"3")
188
            "pxor "MM"7, "MM"7                  \n\t" // 0
189
            "pxor "MM"4, "MM"4                  \n\t" // 0
160 190
            "mov $-128, %%"REG_a"               \n\t"
161 191
            ASMALIGN(4)
162 192
            "1:                                 \n\t"
163
            "pxor %%mm1, %%mm1                  \n\t" // 0
164
            "movq (%1, %%"REG_a"), %%mm0        \n\t" // block[i]
165
            "pcmpgtw %%mm0, %%mm1               \n\t" // block[i] <= 0 ? 0xFF : 0x00
166
            "pxor %%mm1, %%mm0                  \n\t"
167
            "psubw %%mm1, %%mm0                 \n\t" // ABS(block[i])
168
            "movq (%3, %%"REG_a"), %%mm6        \n\t" // bias[0]
169
            "paddusw %%mm6, %%mm0               \n\t" // ABS(block[i]) + bias[0]
170
            "movq (%2, %%"REG_a"), %%mm5        \n\t" // qmat[i]
171
            "pmulhw %%mm5, %%mm0                \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
172
            "por %%mm0, %%mm4                   \n\t"
173
            "pxor %%mm1, %%mm0                  \n\t"
174
            "psubw %%mm1, %%mm0                 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
175
            "movq %%mm0, (%5, %%"REG_a")        \n\t"
176
            "pcmpeqw %%mm7, %%mm0               \n\t" // out==0 ? 0xFF : 0x00
177
            "movq (%4, %%"REG_a"), %%mm1        \n\t"
178
            "movq %%mm7, (%1, %%"REG_a")        \n\t" // 0
179
            "pandn %%mm1, %%mm0                 \n\t"
180
            PMAXW(%%mm0, %%mm3)
181
            "add $8, %%"REG_a"                  \n\t"
193
            MOVQ" (%1, %%"REG_a"), "MM"0        \n\t" // block[i]
194
            SAVE_SIGN(MM"1", MM"0")                   // ABS(block[i])
195
            MOVQ" (%3, %%"REG_a"), "MM"6        \n\t" // bias[0]
196
            "paddusw "MM"6, "MM"0               \n\t" // ABS(block[i]) + bias[0]
197
            MOVQ" (%2, %%"REG_a"), "MM"5        \n\t" // qmat[i]
198
            "pmulhw "MM"5, "MM"0                \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
199
            "por "MM"0, "MM"4                   \n\t"
200
            RESTORE_SIGN(MM"1", MM"0")                // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
201
            MOVQ" "MM"0, (%5, %%"REG_a")        \n\t"
202
            "pcmpeqw "MM"7, "MM"0               \n\t" // out==0 ? 0xFF : 0x00
203
            MOVQ" (%4, %%"REG_a"), "MM"1        \n\t"
204
            MOVQ" "MM"7, (%1, %%"REG_a")        \n\t" // 0
205
            "pandn "MM"1, "MM"0                 \n\t"
206
            PMAXW(MM"0", MM"3")
207
            "add $"MMREG_WIDTH", %%"REG_a"      \n\t"
182 208
            " js 1b                             \n\t"
183
            PMAX(%%mm3, %%mm0)
184
            "movd %%mm3, %%"REG_a"              \n\t"
209
            PMAX(MM"3", MM"0")
210
            "movd "MM"3, %%"REG_a"              \n\t"
185 211
            "movzb %%al, %%"REG_a"              \n\t" // last_non_zero_p1
186 212
            : "+a" (last_non_zero_p1)
187 213
            : "r" (block+64), "r" (qmat+64), "r" (bias+64),
188 214
              "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
189 215
        );
190
        // note the asm is split because gcc doesn't like that many operands ...
191
        asm volatile(
192
            "movd %1, %%mm1                     \n\t" // max_qcoeff
193
            SPREADW(%%mm1)
194
            "psubusw %%mm1, %%mm4               \n\t"
195
            "packuswb %%mm4, %%mm4              \n\t"
196
            "movd %%mm4, %0                     \n\t" // *overflow
216
    }
217
    asm volatile(
218
        "movd %1, "MM"1                     \n\t" // max_qcoeff
219
        SPREADW(MM"1")
220
        "psubusw "MM"1, "MM"4               \n\t"
221
        "packuswb "MM"4, "MM"4              \n\t"
222
#ifdef HAVE_SSE2
223
        "packuswb "MM"4, "MM"4              \n\t"
224
#endif
225
        "movd "MM"4, %0                     \n\t" // *overflow
197 226
        : "=g" (*overflow)
198 227
        : "g" (s->max_qcoeff)
199
        );
200
    }
228
    );
201 229

  
202 230
    if(s->mb_intra) block[0]= level;
203 231
    else            block[0]= temp_block[0];

Also available in: Unified diff