Revision 164d75eb
libavcodec/i386/motion_est_mmx.c  

70  70  
71  71 
static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 
72  72 
{ 
73 
long len= (stride*h); 

74  73 
asm volatile( 
75  74 
ASMALIGN(4) 
76  75 
"1: \n\t" 
77 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 

78 
"psadbw (%2, %%"REG_a"), %%mm0 \n\t" 

79 
"add %3, %%"REG_a" \n\t" 

80 
"movq (%1, %%"REG_a"), %%mm1 \n\t" 

81 
"psadbw (%2, %%"REG_a"), %%mm1 \n\t" 

82 
"paddw %%mm1, %%mm0 \n\t" 

76 
"movq (%1), %%mm0 \n\t" 

77 
"movq (%1, %3), %%mm1 \n\t" 

78 
"psadbw (%2), %%mm0 \n\t" 

79 
"psadbw (%2, %3), %%mm1 \n\t" 

83  80 
"paddw %%mm0, %%mm6 \n\t" 
84 
"add %3, %%"REG_a" \n\t" 

85 
" js 1b \n\t" 

86 
: "+a" (len) 

87 
: "r" (blk1  len), "r" (blk2  len), "r" ((long)stride) 

81 
"paddw %%mm1, %%mm6 \n\t" 

82 
"lea (%1,%3,2), %1 \n\t" 

83 
"lea (%2,%3,2), %2 \n\t" 

84 
"sub $2, %0 \n\t" 

85 
" jg 1b \n\t" 

86 
: "+r" (h), "+r" (blk1), "+r" (blk2) 

87 
: "r" ((long)stride) 

88  88 
); 
89  89 
} 
90  90  
91 
static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)


91 
static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)


92  92 
{ 
93 
long len= (stride*h); 

94  93 
asm volatile( 
95  94 
ASMALIGN(4) 
96  95 
"1: \n\t" 
97 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 

98 
"pavgb (%2, %%"REG_a"), %%mm0 \n\t" 

99 
"psadbw (%3, %%"REG_a"), %%mm0 \n\t" 

100 
"add %4, %%"REG_a" \n\t" 

101 
"movq (%1, %%"REG_a"), %%mm1 \n\t" 

102 
"pavgb (%2, %%"REG_a"), %%mm1 \n\t" 

103 
"psadbw (%3, %%"REG_a"), %%mm1 \n\t" 

104 
"paddw %%mm1, %%mm0 \n\t" 

96 
"movq (%1), %%mm0 \n\t" 

97 
"movq (%1, %3), %%mm1 \n\t" 

98 
"pavgb 1(%1), %%mm0 \n\t" 

99 
"pavgb 1(%1, %3), %%mm1 \n\t" 

100 
"psadbw (%2), %%mm0 \n\t" 

101 
"psadbw (%2, %3), %%mm1 \n\t" 

105  102 
"paddw %%mm0, %%mm6 \n\t" 
106 
"add %4, %%"REG_a" \n\t" 

107 
" js 1b \n\t" 

108 
: "+a" (len) 

109 
: "r" (blk1a  len), "r" (blk1b len), "r" (blk2  len), "r" ((long)stride) 

103 
"paddw %%mm1, %%mm6 \n\t" 

104 
"lea (%1,%3,2), %1 \n\t" 

105 
"lea (%2,%3,2), %2 \n\t" 

106 
"sub $2, %0 \n\t" 

107 
" jg 1b \n\t" 

108 
: "+r" (h), "+r" (blk1), "+r" (blk2) 

109 
: "r" ((long)stride) 

110 
); 

111 
} 

112  
113 
static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 

114 
{ 

115 
asm volatile( 

116 
"movq (%1), %%mm0 \n\t" 

117 
"add %3, %1 \n\t" 

118 
ASMALIGN(4) 

119 
"1: \n\t" 

120 
"movq (%1), %%mm1 \n\t" 

121 
"movq (%1, %3), %%mm2 \n\t" 

122 
"pavgb %%mm1, %%mm0 \n\t" 

123 
"pavgb %%mm2, %%mm1 \n\t" 

124 
"psadbw (%2), %%mm0 \n\t" 

125 
"psadbw (%2, %3), %%mm1 \n\t" 

126 
"paddw %%mm0, %%mm6 \n\t" 

127 
"paddw %%mm1, %%mm6 \n\t" 

128 
"movq %%mm2, %%mm0 \n\t" 

129 
"lea (%1,%3,2), %1 \n\t" 

130 
"lea (%2,%3,2), %2 \n\t" 

131 
"sub $2, %0 \n\t" 

132 
" jg 1b \n\t" 

133 
: "+r" (h), "+r" (blk1), "+r" (blk2) 

134 
: "r" ((long)stride) 

110  135 
); 
111  136 
} 
112  137  
113  138 
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 
114 
{ //FIXME reuse src 

115 
long len= (stride*h); 

139 
{ 

116  140 
asm volatile( 
117  141 
"movq "MANGLE(bone)", %%mm5 \n\t" 
142 
"movq (%1), %%mm0 \n\t" 

143 
"pavgb 1(%1), %%mm0 \n\t" 

144 
"add %3, %1 \n\t" 

118  145 
ASMALIGN(4) 
119  146 
"1: \n\t" 
120 
"movq (%1, %%"REG_a"), %%mm0 \n\t"


121 
"movq 1(%1, %%"REG_a"), %%mm1 \n\t"


122 
"pavgb (%2, %%"REG_a"), %%mm0 \n\t"


123 
"pavgb 1(%2, %%"REG_a"), %%mm1 \n\t"


147 
"movq (%1), %%mm1 \n\t"


148 
"movq (%1,%3), %%mm2 \n\t"


149 
"pavgb 1(%1), %%mm1 \n\t"


150 
"pavgb 1(%1,%3), %%mm2 \n\t"


124  151 
"psubusb %%mm5, %%mm1 \n\t" 
125  152 
"pavgb %%mm1, %%mm0 \n\t" 
126 
"psadbw (%3, %%"REG_a"), %%mm0 \n\t" 

127 
"add %4, %%"REG_a" \n\t" 

128 
"movq (%1, %%"REG_a"), %%mm2 \n\t" 

129 
"movq 1(%1, %%"REG_a"), %%mm3 \n\t" 

130 
"pavgb (%2, %%"REG_a"), %%mm2 \n\t" 

131 
"pavgb 1(%2, %%"REG_a"), %%mm3 \n\t" 

132 
"psubusb %%mm5, %%mm3 \n\t" 

133 
"pavgb %%mm3, %%mm2 \n\t" 

134 
"psadbw (%3, %%"REG_a"), %%mm2 \n\t" 

135 
"paddw %%mm2, %%mm0 \n\t" 

153 
"pavgb %%mm2, %%mm1 \n\t" 

154 
"psadbw (%2), %%mm0 \n\t" 

155 
"psadbw (%2,%3), %%mm1 \n\t" 

136  156 
"paddw %%mm0, %%mm6 \n\t" 
137 
"add %4, %%"REG_a" \n\t" 

138 
" js 1b \n\t" 

139 
: "+a" (len) 

140 
: "r" (blk1  len), "r" (blk1  len + stride), "r" (blk2  len), "r" ((long)stride) 

157 
"paddw %%mm1, %%mm6 \n\t" 

158 
"movq %%mm2, %%mm0 \n\t" 

159 
"lea (%1,%3,2), %1 \n\t" 

160 
"lea (%2,%3,2), %2 \n\t" 

161 
"sub $2, %0 \n\t" 

162 
" jg 1b \n\t" 

163 
: "+r" (h), "+r" (blk1), "+r" (blk2) 

164 
: "r" ((long)stride) 

141  165 
); 
142  166 
} 
143  167  
...  ...  
183  207 
{ 
184  208 
long len= (stride*h); 
185  209 
asm volatile( 
186 
ASMALIGN(4) 

187 
"1: \n\t" 

188  210 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
189 
"movq (%2, %%"REG_a"), %%mm1 \n\t" 

190 
"movq %%mm0, %%mm4 \n\t" 

191 
"movq %%mm1, %%mm2 \n\t" 

192 
"punpcklbw %%mm7, %%mm0 \n\t" 

193 
"punpcklbw %%mm7, %%mm1 \n\t" 

194 
"punpckhbw %%mm7, %%mm4 \n\t" 

195 
"punpckhbw %%mm7, %%mm2 \n\t" 

196 
"paddw %%mm1, %%mm0 \n\t" 

197 
"paddw %%mm2, %%mm4 \n\t" 

198  211 
"movq 1(%1, %%"REG_a"), %%mm2 \n\t" 
199 
"movq 1(%2, %%"REG_a"), %%mm3 \n\t"


200 
"movq %%mm2, %%mm1 \n\t"


201 
"punpcklbw %%mm7, %%mm2 \n\t"


212 
"movq %%mm0, %%mm1 \n\t"


213 
"movq %%mm2, %%mm3 \n\t"


214 
"punpcklbw %%mm7, %%mm0 \n\t"


202  215 
"punpckhbw %%mm7, %%mm1 \n\t" 
203 
"paddw %%mm0, %%mm2 \n\t" 

204 
"paddw %%mm4, %%mm1 \n\t" 

205 
"movq %%mm3, %%mm4 \n\t" 

206 
"punpcklbw %%mm7, %%mm3 \n\t" 

207 
"punpckhbw %%mm7, %%mm4 \n\t" 

208 
"paddw %%mm3, %%mm2 \n\t" 

209 
"paddw %%mm4, %%mm1 \n\t" 

210 
"movq (%3, %%"REG_a"), %%mm3 \n\t" 

211 
"movq (%3, %%"REG_a"), %%mm4 \n\t" 

212 
"paddw %%mm5, %%mm2 \n\t" 

216 
"punpcklbw %%mm7, %%mm2 \n\t" 

217 
"punpckhbw %%mm7, %%mm3 \n\t" 

218 
"paddw %%mm2, %%mm0 \n\t" 

219 
"paddw %%mm3, %%mm1 \n\t" 

220 
ASMALIGN(4) 

221 
"1: \n\t" 

222 
"movq (%2, %%"REG_a"), %%mm2 \n\t" 

223 
"movq 1(%2, %%"REG_a"), %%mm4 \n\t" 

224 
"movq %%mm2, %%mm3 \n\t" 

225 
"movq %%mm4, %%mm5 \n\t" 

226 
"punpcklbw %%mm7, %%mm2 \n\t" 

227 
"punpckhbw %%mm7, %%mm3 \n\t" 

228 
"punpcklbw %%mm7, %%mm4 \n\t" 

229 
"punpckhbw %%mm7, %%mm5 \n\t" 

230 
"paddw %%mm4, %%mm2 \n\t" 

231 
"paddw %%mm5, %%mm3 \n\t" 

232 
"movq 16+"MANGLE(round_tab)", %%mm5 \n\t" 

233 
"paddw %%mm2, %%mm0 \n\t" 

234 
"paddw %%mm3, %%mm1 \n\t" 

235 
"paddw %%mm5, %%mm0 \n\t" 

213  236 
"paddw %%mm5, %%mm1 \n\t" 
214 
"psrlw $2, %%mm2 \n\t" 

237 
"movq (%3, %%"REG_a"), %%mm4 \n\t" 

238 
"movq (%3, %%"REG_a"), %%mm5 \n\t" 

239 
"psrlw $2, %%mm0 \n\t" 

215  240 
"psrlw $2, %%mm1 \n\t" 
216 
"packuswb %%mm1, %%mm2 \n\t"


217 
"psubusb %%mm2, %%mm3 \n\t"


218 
"psubusb %%mm4, %%mm2 \n\t"


219 
"por %%mm3, %%mm2 \n\t"


220 
"movq %%mm2, %%mm0 \n\t"


241 
"packuswb %%mm1, %%mm0 \n\t"


242 
"psubusb %%mm0, %%mm4 \n\t"


243 
"psubusb %%mm5, %%mm0 \n\t"


244 
"por %%mm4, %%mm0 \n\t"


245 
"movq %%mm0, %%mm4 \n\t"


221  246 
"punpcklbw %%mm7, %%mm0 \n\t" 
222 
"punpckhbw %%mm7, %%mm2 \n\t" 

223 
"paddw %%mm2, %%mm0 \n\t" 

247 
"punpckhbw %%mm7, %%mm4 \n\t" 

224  248 
"paddw %%mm0, %%mm6 \n\t" 
249 
"paddw %%mm4, %%mm6 \n\t" 

250 
"movq %%mm2, %%mm0 \n\t" 

251 
"movq %%mm3, %%mm1 \n\t" 

225  252 
"add %4, %%"REG_a" \n\t" 
226  253 
" js 1b \n\t" 
227  254 
: "+a" (len) 
...  ...  
255  282 
return ret; 
256  283 
} 
257  284  
285 
/**
 * Plain-MMX horizontal half-pel SAD: adapter giving the two-pointer
 * sad8_2_mmx the same (blk1, blk2, stride, h) signature as the MMX2
 * sad8_x2a variant, so the PIX_SAD macro can name both uniformly.
 * The second source is blk1 shifted right by one pixel.
 */
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
}

289 
/**
 * Plain-MMX vertical half-pel SAD: adapter giving the two-pointer
 * sad8_2_mmx the same (blk1, blk2, stride, h) signature as the MMX2
 * sad8_y2a variant, so the PIX_SAD macro can name both uniformly.
 * The second source is blk1 shifted down by one row.
 */
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
}

293  
258  294  
259  295 
#define PIX_SAD(suf)\ 
260  296 
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 
...  ...  
272  308 
assert(h==8);\ 
273  309 
asm volatile("pxor %%mm7, %%mm7 \n\t"\ 
274  310 
"pxor %%mm6, %%mm6 \n\t"\ 
275 
"movq %0, %%mm5 \n\t"\ 

276  311 
:: "m"(round_tab[1]) \ 
277  312 
);\ 
278  313 
\ 
279 
sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 8);\


314 
sad8_x2a_ ## suf(blk1, blk2, stride, 8);\


280  315 
\ 
281  316 
return sum_ ## suf();\ 
282  317 
}\ 
...  ...  
290  325 
:: "m"(round_tab[1]) \ 
291  326 
);\ 
292  327 
\ 
293 
sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 8);\


328 
sad8_y2a_ ## suf(blk1, blk2, stride, 8);\


294  329 
\ 
295  330 
return sum_ ## suf();\ 
296  331 
}\ 
...  ...  
300  335 
assert(h==8);\ 
301  336 
asm volatile("pxor %%mm7, %%mm7 \n\t"\ 
302  337 
"pxor %%mm6, %%mm6 \n\t"\ 
303 
"movq %0, %%mm5 \n\t"\ 

304 
:: "m"(round_tab[2]) \ 

305 
);\ 

338 
::);\ 

306  339 
\ 
307  340 
sad8_4_ ## suf(blk1, blk2, stride, 8);\ 
308  341 
\ 
...  ...  
327  360 
:: "m"(round_tab[1]) \ 
328  361 
);\ 
329  362 
\ 
330 
sad8_2_ ## suf(blk1 , blk1+1, blk2 , stride, h);\


331 
sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, h);\


363 
sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\


364 
sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\


332  365 
\ 
333  366 
return sum_ ## suf();\ 
334  367 
}\ 
...  ...  
340  373 
:: "m"(round_tab[1]) \ 
341  374 
);\ 
342  375 
\ 
343 
sad8_2_ ## suf(blk1 , blk1+stride, blk2 , stride, h);\


344 
sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, h);\


376 
sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\


377 
sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\


345  378 
\ 
346  379 
return sum_ ## suf();\ 
347  380 
}\ 
...  ...  
349  382 
{\ 
350  383 
asm volatile("pxor %%mm7, %%mm7 \n\t"\ 
351  384 
"pxor %%mm6, %%mm6 \n\t"\ 
352 
"movq %0, %%mm5 \n\t"\ 

353 
:: "m"(round_tab[2]) \ 

354 
);\ 

385 
::);\ 

355  386 
\ 
356  387 
sad8_4_ ## suf(blk1 , blk2 , stride, h);\ 
357  388 
sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ 
Also available in: Unified diff