Revision ff506a90 libavcodec/i386/mpegvideo_mmx_template.c
libavcodec/i386/mpegvideo_mmx_template.c  

19  19 
* License along with FFmpeg; if not, write to the Free Software 
20  20 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 
21  21 
*/ 
22  
23 
#undef MMREG_WIDTH 

24 
#undef MM 

25 
#undef MOVQ 

22  26 
#undef SPREADW 
23  27 
#undef PMAXW 
24  28 
#undef PMAX 
25 
#ifdef HAVE_MMX2 

26 
#define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t" 

27 
#define PMAXW(a,b) "pmaxsw " #a ", " #b " \n\t" 

29 
#undef SAVE_SIGN 

30 
#undef RESTORE_SIGN 

31  
32 
#if defined(HAVE_SSE2) 

33 
#define MMREG_WIDTH "16" 

34 
#define MM "%%xmm" 

35 
#define MOVQ "movdqa" 

36 
#define SPREADW(a) \ 

37 
"pshuflw $0, "a", "a" \n\t"\ 

38 
"punpcklwd "a", "a" \n\t" 

39 
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t" 

28  40 
#define PMAX(a,b) \ 
29 
"pshufw $0x0E," #a ", " #b " \n\t"\


41 
"movhlps "a", "b" \n\t"\


30  42 
PMAXW(b, a)\ 
31 
"pshufw $0x01," #a ", " #b " \n\t"\ 

43 
"pshuflw $0x0E, "a", "b" \n\t"\ 

44 
PMAXW(b, a)\ 

45 
"pshuflw $0x01, "a", "b" \n\t"\ 

46 
PMAXW(b, a) 

47 
#else 

48 
#define MMREG_WIDTH "8" 

49 
#define MM "%%mm" 

50 
#define MOVQ "movq" 

51 
#if defined(HAVE_MMX2) 

52 
#define SPREADW(a) "pshufw $0, "a", "a" \n\t" 

53 
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t" 

54 
#define PMAX(a,b) \ 

55 
"pshufw $0x0E, "a", "b" \n\t"\ 

56 
PMAXW(b, a)\ 

57 
"pshufw $0x01, "a", "b" \n\t"\ 

32  58 
PMAXW(b, a) 
33  59 
#else 
34  60 
#define SPREADW(a) \ 
35 
"punpcklwd " #a ", " #a " \n\t"\


36 
"punpcklwd " #a ", " #a " \n\t"


61 
"punpcklwd "a", "a" \n\t"\


62 
"punpcklwd "a", "a" \n\t"


37  63 
#define PMAXW(a,b) \ 
38 
"psubusw " #a ", " #b " \n\t"\


39 
"paddw " #a ", " #b " \n\t"


64 
"psubusw "a", "b" \n\t"\


65 
"paddw "a", "b" \n\t"


40  66 
#define PMAX(a,b) \ 
41 
"movq " #a ", " #b " \n\t"\


42 
"psrlq $32, " #a " \n\t"\


67 
"movq "a", "b" \n\t"\


68 
"psrlq $32, "a" \n\t"\


43  69 
PMAXW(b, a)\ 
44 
"movq " #a ", " #b " \n\t"\


45 
"psrlq $16, " #a " \n\t"\


70 
"movq "a", "b" \n\t"\


71 
"psrlq $16, "a" \n\t"\


46  72 
PMAXW(b, a) 
47  73  
48  74 
#endif 
75 
#endif 

76  
77 
#ifdef HAVE_SSSE3 

78 
#define SAVE_SIGN(a,b) \ 

79 
"movdqa "b", "a" \n\t"\ 

80 
"pabsw "b", "b" \n\t" 

81 
#define RESTORE_SIGN(a,b) \ 

82 
"psignw "a", "b" \n\t" 

83 
#else 

84 
#define SAVE_SIGN(a,b) \ 

85 
"pxor "a", "a" \n\t"\ 

86 
"pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\ 

87 
"pxor "a", "b" \n\t"\ 

88 
"psubw "a", "b" \n\t" /* ABS(block[i]) */ 

89 
#define RESTORE_SIGN(a,b) \ 

90 
"pxor "a", "b" \n\t"\ 

91 
"psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) 

92 
#endif 

49  93  
50  94 
static int RENAME(dct_quantize)(MpegEncContext *s, 
51  95 
DCTELEM *block, int n, 
...  ...  
54  98 
long last_non_zero_p1; 
55  99 
int level=0, q; //=0 is cuz gcc says uninitalized ... 
56  100 
const uint16_t *qmat, *bias; 
57 
DECLARE_ALIGNED_8(int16_t, temp_block[64]);


101 
DECLARE_ALIGNED_16(int16_t, temp_block[64]);


58  102  
59  103 
assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? 
60  104  
...  ...  
106  150 
if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){ 
107  151  
108  152 
asm volatile( 
109 
"movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1


110 
SPREADW(%%mm3)


111 
"pxor %%mm7, %%mm7 \n\t" // 0


112 
"pxor %%mm4, %%mm4 \n\t" // 0


113 
"movq (%2), %%mm5 \n\t" // qmat[0]


114 
"pxor %%mm6, %%mm6 \n\t"


115 
"psubw (%3), %%mm6 \n\t" // bias[0]


153 
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1


154 
SPREADW(MM"3")


155 
"pxor "MM"7, "MM"7 \n\t" // 0


156 
"pxor "MM"4, "MM"4 \n\t" // 0


157 
MOVQ" (%2), "MM"5 \n\t" // qmat[0]


158 
"pxor "MM"6, "MM"6 \n\t"


159 
"psubw (%3), "MM"6 \n\t" // bias[0]


116  160 
"mov $128, %%"REG_a" \n\t" 
117  161 
ASMALIGN(4) 
118  162 
"1: \n\t" 
119 
"pxor %%mm1, %%mm1 \n\t" // 0 

120 
"movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] 

121 
"pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 

122 
"pxor %%mm1, %%mm0 \n\t" 

123 
"psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) 

124 
"psubusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] 

125 
"pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 

126 
"por %%mm0, %%mm4 \n\t" 

127 
"pxor %%mm1, %%mm0 \n\t" 

128 
"psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) 

129 
"movq %%mm0, (%5, %%"REG_a") \n\t" 

130 
"pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 

131 
"movq (%4, %%"REG_a"), %%mm1 \n\t" 

132 
"movq %%mm7, (%1, %%"REG_a") \n\t" // 0 

133 
"pandn %%mm1, %%mm0 \n\t" 

134 
PMAXW(%%mm0, %%mm3) 

135 
"add $8, %%"REG_a" \n\t" 

163 
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] 

164 
SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) 

165 
"psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] 

166 
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 

167 
"por "MM"0, "MM"4 \n\t" 

168 
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) 

169 
MOVQ" "MM"0, (%5, %%"REG_a") \n\t" 

170 
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00 

171 
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" 

172 
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 

173 
"pandn "MM"1, "MM"0 \n\t" 

174 
PMAXW(MM"0", MM"3") 

175 
"add $"MMREG_WIDTH", %%"REG_a" \n\t" 

136  176 
" js 1b \n\t" 
137 
PMAX(%%mm3, %%mm0)


138 
"movd %%mm3, %%"REG_a" \n\t"


177 
PMAX(MM"3", MM"0")


178 
"movd "MM"3, %%"REG_a" \n\t"


139  179 
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 
140  180 
: "+a" (last_non_zero_p1) 
141  181 
: "r" (block+64), "r" (qmat), "r" (bias), 
142  182 
"r" (inv_zigzag_direct16+64), "r" (temp_block+64) 
143  183 
); 
144 
// note the asm is split cuz gcc doesnt like that many operands ... 

145 
asm volatile( 

146 
"movd %1, %%mm1 \n\t" // max_qcoeff 

147 
SPREADW(%%mm1) 

148 
"psubusw %%mm1, %%mm4 \n\t" 

149 
"packuswb %%mm4, %%mm4 \n\t" 

150 
"movd %%mm4, %0 \n\t" // *overflow 

151 
: "=g" (*overflow) 

152 
: "g" (s->max_qcoeff) 

153 
); 

154  184 
}else{ // FMT_H263 
155  185 
asm volatile( 
156 
"movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1


157 
SPREADW(%%mm3)


158 
"pxor %%mm7, %%mm7 \n\t" // 0


159 
"pxor %%mm4, %%mm4 \n\t" // 0


186 
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1


187 
SPREADW(MM"3")


188 
"pxor "MM"7, "MM"7 \n\t" // 0


189 
"pxor "MM"4, "MM"4 \n\t" // 0


160  190 
"mov $128, %%"REG_a" \n\t" 
161  191 
ASMALIGN(4) 
162  192 
"1: \n\t" 
163 
"pxor %%mm1, %%mm1 \n\t" // 0 

164 
"movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] 

165 
"pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 

166 
"pxor %%mm1, %%mm0 \n\t" 

167 
"psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) 

168 
"movq (%3, %%"REG_a"), %%mm6 \n\t" // bias[0] 

169 
"paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] 

170 
"movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i] 

171 
"pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 

172 
"por %%mm0, %%mm4 \n\t" 

173 
"pxor %%mm1, %%mm0 \n\t" 

174 
"psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) 

175 
"movq %%mm0, (%5, %%"REG_a") \n\t" 

176 
"pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 

177 
"movq (%4, %%"REG_a"), %%mm1 \n\t" 

178 
"movq %%mm7, (%1, %%"REG_a") \n\t" // 0 

179 
"pandn %%mm1, %%mm0 \n\t" 

180 
PMAXW(%%mm0, %%mm3) 

181 
"add $8, %%"REG_a" \n\t" 

193 
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] 

194 
SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) 

195 
MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0] 

196 
"paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] 

197 
MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i] 

198 
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 

199 
"por "MM"0, "MM"4 \n\t" 

200 
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) 

201 
MOVQ" "MM"0, (%5, %%"REG_a") \n\t" 

202 
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00 

203 
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" 

204 
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 

205 
"pandn "MM"1, "MM"0 \n\t" 

206 
PMAXW(MM"0", MM"3") 

207 
"add $"MMREG_WIDTH", %%"REG_a" \n\t" 

182  208 
" js 1b \n\t" 
183 
PMAX(%%mm3, %%mm0)


184 
"movd %%mm3, %%"REG_a" \n\t"


209 
PMAX(MM"3", MM"0")


210 
"movd "MM"3, %%"REG_a" \n\t"


185  211 
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 
186  212 
: "+a" (last_non_zero_p1) 
187  213 
: "r" (block+64), "r" (qmat+64), "r" (bias+64), 
188  214 
"r" (inv_zigzag_direct16+64), "r" (temp_block+64) 
189  215 
); 
190 
// note the asm is split cuz gcc doesnt like that many operands ... 

191 
asm volatile( 

192 
"movd %1, %%mm1 \n\t" // max_qcoeff 

193 
SPREADW(%%mm1) 

194 
"psubusw %%mm1, %%mm4 \n\t" 

195 
"packuswb %%mm4, %%mm4 \n\t" 

196 
"movd %%mm4, %0 \n\t" // *overflow 

216 
} 

217 
asm volatile( 

218 
"movd %1, "MM"1 \n\t" // max_qcoeff 

219 
SPREADW(MM"1") 

220 
"psubusw "MM"1, "MM"4 \n\t" 

221 
"packuswb "MM"4, "MM"4 \n\t" 

222 
#ifdef HAVE_SSE2 

223 
"packuswb "MM"4, "MM"4 \n\t" 

224 
#endif 

225 
"movd "MM"4, %0 \n\t" // *overflow 

197  226 
: "=g" (*overflow) 
198  227 
: "g" (s->max_qcoeff) 
199 
); 

200 
} 

228 
); 

201  229  
202  230 
if(s->mb_intra) block[0]= level; 
203  231 
else block[0]= temp_block[0]; 
Also available in: Unified diff