Revision 16e0bf73 libpostproc/postprocess_template.c
libpostproc/postprocess_template.c  

42  42 
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" 
43  43 
#elif defined (HAVE_MMX) 
44  44 
#define PMINUB(b,a,t) \ 
45 
"movq " #a ", " #t " \n\t"\


46 
"psubusb " #b ", " #t " \n\t"\


47 
"psubb " #t ", " #a " \n\t"


45 
"movq " #a ", " #t " \n\t"\ 

46 
"psubusb " #b ", " #t " \n\t"\ 

47 
"psubb " #t ", " #a " \n\t" 

48  48 
#endif 
49  49  
50  50 
#ifdef HAVE_MMX2 
51  51 
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" 
52  52 
#elif defined (HAVE_MMX) 
53  53 
#define PMAXUB(a,b) \ 
54 
"psubusb " #a ", " #b " \n\t"\


55 
"paddb " #a ", " #b " \n\t"


54 
"psubusb " #a ", " #b " \n\t"\ 

55 
"paddb " #a ", " #b " \n\t" 

56  56 
#endif 
57  57  
58  58 
//FIXME? |255-0| = 1 (should not be a problem ...)
...  ...  
61  61 
* Check if the middle 8x8 Block in the given 8x16 block is flat 
62  62 
*/ 
63  63 
static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ 
64 
int numEq= 0, dcOk;


65 
src+= stride*4; // src points to begin of the 8x8 Block


66 
asm volatile( 

67 
"movq %0, %%mm7 \n\t"


68 
"movq %1, %%mm6 \n\t"


69
: : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])


70 
);


71  
72 
asm volatile( 

73 
"lea (%2, %3), %%"REG_a" \n\t"


64 
int numEq= 0, dcOk; 

65 
src+= stride*4; // src points to begin of the 8x8 Block 

66 
asm volatile(


67 
"movq %0, %%mm7 \n\t" 

68 
"movq %1, %%mm6 \n\t" 

69
: : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])

70 
); 

71  
72 
asm volatile(


73 
"lea (%2, %3), %%"REG_a" \n\t" 

74  74 
// 0 1 2 3 4 5 6 7 8 9 
75  75 
// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 
76  76  
77 
"movq (%2), %%mm0 \n\t"


78 
"movq (%%"REG_a"), %%mm1 \n\t"


79 
"movq %%mm0, %%mm3 \n\t"


80 
"movq %%mm0, %%mm4 \n\t"


81 
PMAXUB(%%mm1, %%mm4)


82 
PMINUB(%%mm1, %%mm3, %%mm5)


83
"psubb %%mm1, %%mm0 \n\t" // mm0 = difference


84 
"paddb %%mm7, %%mm0 \n\t"


85 
"pcmpgtb %%mm6, %%mm0 \n\t"


86  
87 
"movq (%%"REG_a",%3), %%mm2 \n\t"


88 
PMAXUB(%%mm2, %%mm4)


89 
PMINUB(%%mm2, %%mm3, %%mm5)


90 
"psubb %%mm2, %%mm1 \n\t"


91 
"paddb %%mm7, %%mm1 \n\t"


92 
"pcmpgtb %%mm6, %%mm1 \n\t"


93 
"paddb %%mm1, %%mm0 \n\t"


94  
95 
"movq (%%"REG_a", %3, 2), %%mm1 \n\t"


96 
PMAXUB(%%mm1, %%mm4)


97 
PMINUB(%%mm1, %%mm3, %%mm5)


98 
"psubb %%mm1, %%mm2 \n\t"


99 
"paddb %%mm7, %%mm2 \n\t"


100 
"pcmpgtb %%mm6, %%mm2 \n\t"


101 
"paddb %%mm2, %%mm0 \n\t"


102  
103 
"lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"


104  
105 
"movq (%2, %3, 4), %%mm2 \n\t"


106 
PMAXUB(%%mm2, %%mm4)


107 
PMINUB(%%mm2, %%mm3, %%mm5)


108 
"psubb %%mm2, %%mm1 \n\t"


109 
"paddb %%mm7, %%mm1 \n\t"


110 
"pcmpgtb %%mm6, %%mm1 \n\t"


111 
"paddb %%mm1, %%mm0 \n\t"


112  
113 
"movq (%%"REG_a"), %%mm1 \n\t"


114 
PMAXUB(%%mm1, %%mm4)


115 
PMINUB(%%mm1, %%mm3, %%mm5)


116 
"psubb %%mm1, %%mm2 \n\t"


117 
"paddb %%mm7, %%mm2 \n\t"


118 
"pcmpgtb %%mm6, %%mm2 \n\t"


119 
"paddb %%mm2, %%mm0 \n\t"


120  
121 
"movq (%%"REG_a", %3), %%mm2 \n\t"


122 
PMAXUB(%%mm2, %%mm4)


123 
PMINUB(%%mm2, %%mm3, %%mm5)


124 
"psubb %%mm2, %%mm1 \n\t"


125 
"paddb %%mm7, %%mm1 \n\t"


126 
"pcmpgtb %%mm6, %%mm1 \n\t"


127 
"paddb %%mm1, %%mm0 \n\t"


128  
129 
"movq (%%"REG_a", %3, 2), %%mm1 \n\t"


130 
PMAXUB(%%mm1, %%mm4)


131 
PMINUB(%%mm1, %%mm3, %%mm5)


132 
"psubb %%mm1, %%mm2 \n\t"


133 
"paddb %%mm7, %%mm2 \n\t"


134 
"pcmpgtb %%mm6, %%mm2 \n\t"


135 
"paddb %%mm2, %%mm0 \n\t"


136 
"psubusb %%mm3, %%mm4 \n\t"


137  
138 
" \n\t"


77 
"movq (%2), %%mm0 \n\t" 

78 
"movq (%%"REG_a"), %%mm1 \n\t" 

79 
"movq %%mm0, %%mm3 \n\t" 

80 
"movq %%mm0, %%mm4 \n\t" 

81 
PMAXUB(%%mm1, %%mm4) 

82 
PMINUB(%%mm1, %%mm3, %%mm5) 

83
"psubb %%mm1, %%mm0 \n\t" // mm0 = difference

84 
"paddb %%mm7, %%mm0 \n\t" 

85 
"pcmpgtb %%mm6, %%mm0 \n\t" 

86  
87 
"movq (%%"REG_a",%3), %%mm2 \n\t" 

88 
PMAXUB(%%mm2, %%mm4) 

89 
PMINUB(%%mm2, %%mm3, %%mm5) 

90 
"psubb %%mm2, %%mm1 \n\t" 

91 
"paddb %%mm7, %%mm1 \n\t" 

92 
"pcmpgtb %%mm6, %%mm1 \n\t" 

93 
"paddb %%mm1, %%mm0 \n\t" 

94  
95 
"movq (%%"REG_a", %3, 2), %%mm1 \n\t" 

96 
PMAXUB(%%mm1, %%mm4) 

97 
PMINUB(%%mm1, %%mm3, %%mm5) 

98 
"psubb %%mm1, %%mm2 \n\t" 

99 
"paddb %%mm7, %%mm2 \n\t" 

100 
"pcmpgtb %%mm6, %%mm2 \n\t" 

101 
"paddb %%mm2, %%mm0 \n\t" 

102  
103 
"lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" 

104  
105 
"movq (%2, %3, 4), %%mm2 \n\t" 

106 
PMAXUB(%%mm2, %%mm4) 

107 
PMINUB(%%mm2, %%mm3, %%mm5) 

108 
"psubb %%mm2, %%mm1 \n\t" 

109 
"paddb %%mm7, %%mm1 \n\t" 

110 
"pcmpgtb %%mm6, %%mm1 \n\t" 

111 
"paddb %%mm1, %%mm0 \n\t" 

112  
113 
"movq (%%"REG_a"), %%mm1 \n\t" 

114 
PMAXUB(%%mm1, %%mm4) 

115 
PMINUB(%%mm1, %%mm3, %%mm5) 

116 
"psubb %%mm1, %%mm2 \n\t" 

117 
"paddb %%mm7, %%mm2 \n\t" 

118 
"pcmpgtb %%mm6, %%mm2 \n\t" 

119 
"paddb %%mm2, %%mm0 \n\t" 

120  
121 
"movq (%%"REG_a", %3), %%mm2 \n\t" 

122 
PMAXUB(%%mm2, %%mm4) 

123 
PMINUB(%%mm2, %%mm3, %%mm5) 

124 
"psubb %%mm2, %%mm1 \n\t" 

125 
"paddb %%mm7, %%mm1 \n\t" 

126 
"pcmpgtb %%mm6, %%mm1 \n\t" 

127 
"paddb %%mm1, %%mm0 \n\t" 

128  
129 
"movq (%%"REG_a", %3, 2), %%mm1 \n\t" 

130 
PMAXUB(%%mm1, %%mm4) 

131 
PMINUB(%%mm1, %%mm3, %%mm5) 

132 
"psubb %%mm1, %%mm2 \n\t" 

133 
"paddb %%mm7, %%mm2 \n\t" 

134 
"pcmpgtb %%mm6, %%mm2 \n\t" 

135 
"paddb %%mm2, %%mm0 \n\t" 

136 
"psubusb %%mm3, %%mm4 \n\t" 

137  
138 
" \n\t" 

139  139 
#ifdef HAVE_MMX2 
140 
"pxor %%mm7, %%mm7 \n\t"


141 
"psadbw %%mm7, %%mm0 \n\t"


140 
"pxor %%mm7, %%mm7 \n\t" 

141 
"psadbw %%mm7, %%mm0 \n\t" 

142  142 
#else 
143 
"movq %%mm0, %%mm1 \n\t"


144 
"psrlw $8, %%mm0 \n\t"


145 
"paddb %%mm1, %%mm0 \n\t"


146 
"movq %%mm0, %%mm1 \n\t"


147 
"psrlq $16, %%mm0 \n\t"


148 
"paddb %%mm1, %%mm0 \n\t"


149 
"movq %%mm0, %%mm1 \n\t"


150 
"psrlq $32, %%mm0 \n\t"


151 
"paddb %%mm1, %%mm0 \n\t"


143 
"movq %%mm0, %%mm1 \n\t" 

144 
"psrlw $8, %%mm0 \n\t" 

145 
"paddb %%mm1, %%mm0 \n\t" 

146 
"movq %%mm0, %%mm1 \n\t" 

147 
"psrlq $16, %%mm0 \n\t" 

148 
"paddb %%mm1, %%mm0 \n\t" 

149 
"movq %%mm0, %%mm1 \n\t" 

150 
"psrlq $32, %%mm0 \n\t" 

151 
"paddb %%mm1, %%mm0 \n\t" 

152  152 
#endif 
153 
"movq %4, %%mm7 \n\t" // QP,..., QP


154 
"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP


155
"psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0


156 
"packssdw %%mm4, %%mm4 \n\t"


157 
"movd %%mm0, %0 \n\t"


158 
"movd %%mm4, %1 \n\t"


159  
160 
: "=r" (numEq), "=r" (dcOk)


161
: "r" (src), "r" ((long)stride), "m" (c->pQPb)


162 
: "%"REG_a


163 
);


164  
165 
numEq= (numEq) &0xFF;


166 
if(numEq > c->ppMode.flatnessThreshold){


167 
if(dcOk) return 0;


168 
else return 1;


169 
}else{


170 
return 2;


171 
}


153 
"movq %4, %%mm7 \n\t" // QP,..., QP 

154 
"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP 

155
"psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0

156 
"packssdw %%mm4, %%mm4 \n\t" 

157 
"movd %%mm0, %0 \n\t" 

158 
"movd %%mm4, %1 \n\t" 

159  
160 
: "=r" (numEq), "=r" (dcOk) 

161
: "r" (src), "r" ((long)stride), "m" (c->pQPb)

162 
: "%"REG_a 

163 
); 

164  
165 
numEq= (numEq) &0xFF; 

166 
if(numEq > c->ppMode.flatnessThreshold){

167 
if(dcOk) return 0; 

168 
else return 1; 

169 
}else{ 

170 
return 2; 

171 
} 

172  172 
} 
173  173 
#endif //HAVE_MMX 
174  174  
...  ...  
180  180 
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) 
181  181 
{ 
182  182 
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
183 
src+= stride*3;


184 
asm volatile( //"movv %0 %1 %2\n\t"


185 
"movq %2, %%mm0 \n\t" // QP,..., QP


186 
"pxor %%mm4, %%mm4 \n\t"


187  
188 
"movq (%0), %%mm6 \n\t"


189 
"movq (%0, %1), %%mm5 \n\t"


190 
"movq %%mm5, %%mm1 \n\t"


191 
"movq %%mm6, %%mm2 \n\t"


192 
"psubusb %%mm6, %%mm5 \n\t"


193 
"psubusb %%mm1, %%mm2 \n\t"


194 
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines


195
"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0


196
"pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF


197  
198 
"pand %%mm2, %%mm6 \n\t"


199 
"pandn %%mm1, %%mm2 \n\t"


200 
"por %%mm2, %%mm6 \n\t"// First Line to Filter


201  
202 
"movq (%0, %1, 8), %%mm5 \n\t"


203 
"lea (%0, %1, 4), %%"REG_a" \n\t"


204 
"lea (%0, %1, 8), %%"REG_c" \n\t"


205 
"sub %1, %%"REG_c" \n\t"


206 
"add %1, %0 \n\t" // %0 points to line 1 not 0


207 
"movq (%0, %1, 8), %%mm7 \n\t"


208 
"movq %%mm5, %%mm1 \n\t"


209 
"movq %%mm7, %%mm2 \n\t"


210 
"psubusb %%mm7, %%mm5 \n\t"


211 
"psubusb %%mm1, %%mm2 \n\t"


212 
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines


213
"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0


214
"pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF


215  
216 
"pand %%mm2, %%mm7 \n\t"


217 
"pandn %%mm1, %%mm2 \n\t"


218 
"por %%mm2, %%mm7 \n\t" // First Line to Filter


219  
220  
221 
// 1 2 3 4 5 6 7 8


222 
// %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1


223 
// 6 4 2 2 1 1


224 
// 6 4 4 2


225 
// 6 8 2


226  
227 
"movq (%0, %1), %%mm0 \n\t" // 1


228 
"movq %%mm0, %%mm1 \n\t" // 1


229 
PAVGB(%%mm6, %%mm0) //1 1 /2


230 
PAVGB(%%mm6, %%mm0) //3 1 /4


231  
232 
"movq (%0, %1, 4), %%mm2 \n\t" // 1


233 
"movq %%mm2, %%mm5 \n\t" // 1


234 
PAVGB((%%REGa), %%mm2) // 11 /2


235 
PAVGB((%0, %1, 2), %%mm2) // 211 /4


236 
"movq %%mm2, %%mm3 \n\t" // 211 /4


237 
"movq (%0), %%mm4 \n\t" // 1


238 
PAVGB(%%mm4, %%mm3) // 4 211 /8


239 
PAVGB(%%mm0, %%mm3) //642211 /16


240 
"movq %%mm3, (%0) \n\t" // X


241 
// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9


242 
"movq %%mm1, %%mm0 \n\t" // 1


243 
PAVGB(%%mm6, %%mm0) //1 1 /2


244 
"movq %%mm4, %%mm3 \n\t" // 1


245 
PAVGB((%0,%1,2), %%mm3) // 1 1 /2


246 
PAVGB((%%REGa,%1,2), %%mm5) // 11 /2


247 
PAVGB((%%REGa), %%mm5) // 211 /4


248 
PAVGB(%%mm5, %%mm3) // 2 2211 /8


249 
PAVGB(%%mm0, %%mm3) //4242211 /16


250 
"movq %%mm3, (%0,%1) \n\t" // X


251 
// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9


252 
PAVGB(%%mm4, %%mm6) //11 /2


253 
"movq (%%"REG_c"), %%mm0 \n\t" // 1


254 
PAVGB((%%REGa, %1, 2), %%mm0) // 11/2


255 
"movq %%mm0, %%mm3 \n\t" // 11/2


256 
PAVGB(%%mm1, %%mm0) // 2 11/4


257 
PAVGB(%%mm6, %%mm0) //222 11/8


258 
PAVGB(%%mm2, %%mm0) //22242211/16


259 
"movq (%0, %1, 2), %%mm2 \n\t" // 1


260 
"movq %%mm0, (%0, %1, 2) \n\t" // X


261 
// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9


262 
"movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1


263 
PAVGB((%%REGc), %%mm0) // 11 /2


264 
PAVGB(%%mm0, %%mm6) //11 11 /4


265 
PAVGB(%%mm1, %%mm4) // 11 /2


266 
PAVGB(%%mm2, %%mm1) // 11 /2


267 
PAVGB(%%mm1, %%mm6) //1122 11 /8


268 
PAVGB(%%mm5, %%mm6) //112242211 /16


269 
"movq (%%"REG_a"), %%mm5 \n\t" // 1


270 
"movq %%mm6, (%%"REG_a") \n\t" // X


271 
// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9


272 
"movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1


273 
PAVGB(%%mm7, %%mm6) // 11 /2


274 
PAVGB(%%mm4, %%mm6) // 11 11 /4


275 
PAVGB(%%mm3, %%mm6) // 11 2211 /8


276 
PAVGB(%%mm5, %%mm2) // 11 /2


277 
"movq (%0, %1, 4), %%mm4 \n\t" // 1


278 
PAVGB(%%mm4, %%mm2) // 112 /4


279 
PAVGB(%%mm2, %%mm6) // 112242211 /16


280 
"movq %%mm6, (%0, %1, 4) \n\t" // X


281 
// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9


282 
PAVGB(%%mm7, %%mm1) // 11 2 /4


283 
PAVGB(%%mm4, %%mm5) // 11 /2


284 
PAVGB(%%mm5, %%mm0) // 11 11 /4


285 
"movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1


286 
PAVGB(%%mm6, %%mm1) // 11 4 2 /8


287 
PAVGB(%%mm0, %%mm1) // 11224222 /16


288 
"movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X


289 
// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9


290 
PAVGB((%%REGc), %%mm2) // 112 4 /8


291 
"movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1


292 
PAVGB(%%mm0, %%mm6) // 1 1 /2


293 
PAVGB(%%mm7, %%mm6) // 1 12 /4


294 
PAVGB(%%mm2, %%mm6) // 1122424 /4


295 
"movq %%mm6, (%%"REG_c") \n\t" // X


296 
// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9


297 
PAVGB(%%mm7, %%mm5) // 11 2 /4


298 
PAVGB(%%mm7, %%mm5) // 11 6 /8


299  
300 
PAVGB(%%mm3, %%mm0) // 112 /4


301 
PAVGB(%%mm0, %%mm5) // 112246 /16


302 
"movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X


303 
"sub %1, %0 \n\t"


304  
305 
:


306
: "r" (src), "r" ((long)stride), "m" (c->pQPb)


307 
: "%"REG_a, "%"REG_c


308 
);


183 
src+= stride*3; 

184 
asm volatile( //"movv %0 %1 %2\n\t" 

185 
"movq %2, %%mm0 \n\t" // QP,..., QP 

186 
"pxor %%mm4, %%mm4 \n\t" 

187  
188 
"movq (%0), %%mm6 \n\t" 

189 
"movq (%0, %1), %%mm5 \n\t" 

190 
"movq %%mm5, %%mm1 \n\t" 

191 
"movq %%mm6, %%mm2 \n\t" 

192 
"psubusb %%mm6, %%mm5 \n\t" 

193 
"psubusb %%mm1, %%mm2 \n\t" 

194 
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines 

195
"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0

196
"pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF

197  
198 
"pand %%mm2, %%mm6 \n\t" 

199 
"pandn %%mm1, %%mm2 \n\t" 

200 
"por %%mm2, %%mm6 \n\t"// First Line to Filter 

201  
202 
"movq (%0, %1, 8), %%mm5 \n\t" 

203 
"lea (%0, %1, 4), %%"REG_a" \n\t" 

204 
"lea (%0, %1, 8), %%"REG_c" \n\t" 

205 
"sub %1, %%"REG_c" \n\t" 

206 
"add %1, %0 \n\t" // %0 points to line 1 not 0 

207 
"movq (%0, %1, 8), %%mm7 \n\t" 

208 
"movq %%mm5, %%mm1 \n\t" 

209 
"movq %%mm7, %%mm2 \n\t" 

210 
"psubusb %%mm7, %%mm5 \n\t" 

211 
"psubusb %%mm1, %%mm2 \n\t" 

212 
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines 

213
"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0

214
"pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF

215  
216 
"pand %%mm2, %%mm7 \n\t" 

217 
"pandn %%mm1, %%mm2 \n\t" 

218 
"por %%mm2, %%mm7 \n\t" // First Line to Filter 

219  
220  
221 
// 1 2 3 4 5 6 7 8 

222 
// %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 

223 
// 6 4 2 2 1 1 

224 
// 6 4 4 2 

225 
// 6 8 2 

226  
227 
"movq (%0, %1), %%mm0 \n\t" // 1 

228 
"movq %%mm0, %%mm1 \n\t" // 1 

229 
PAVGB(%%mm6, %%mm0) //1 1 /2 

230 
PAVGB(%%mm6, %%mm0) //3 1 /4 

231  
232 
"movq (%0, %1, 4), %%mm2 \n\t" // 1 

233 
"movq %%mm2, %%mm5 \n\t" // 1 

234 
PAVGB((%%REGa), %%mm2) // 11 /2 

235 
PAVGB((%0, %1, 2), %%mm2) // 211 /4 

236 
"movq %%mm2, %%mm3 \n\t" // 211 /4 

237 
"movq (%0), %%mm4 \n\t" // 1 

238 
PAVGB(%%mm4, %%mm3) // 4 211 /8 

239 
PAVGB(%%mm0, %%mm3) //642211 /16 

240 
"movq %%mm3, (%0) \n\t" // X 

241 
// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 

242 
"movq %%mm1, %%mm0 \n\t" // 1 

243 
PAVGB(%%mm6, %%mm0) //1 1 /2 

244 
"movq %%mm4, %%mm3 \n\t" // 1 

245 
PAVGB((%0,%1,2), %%mm3) // 1 1 /2 

246 
PAVGB((%%REGa,%1,2), %%mm5) // 11 /2 

247 
PAVGB((%%REGa), %%mm5) // 211 /4 

248 
PAVGB(%%mm5, %%mm3) // 2 2211 /8 

249 
PAVGB(%%mm0, %%mm3) //4242211 /16 

250 
"movq %%mm3, (%0,%1) \n\t" // X 

251 
// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 

252 
PAVGB(%%mm4, %%mm6) //11 /2 

253 
"movq (%%"REG_c"), %%mm0 \n\t" // 1 

254 
PAVGB((%%REGa, %1, 2), %%mm0) // 11/2 

255 
"movq %%mm0, %%mm3 \n\t" // 11/2 

256 
PAVGB(%%mm1, %%mm0) // 2 11/4 

257 
PAVGB(%%mm6, %%mm0) //222 11/8 

258 
PAVGB(%%mm2, %%mm0) //22242211/16 

259 
"movq (%0, %1, 2), %%mm2 \n\t" // 1 

260 
"movq %%mm0, (%0, %1, 2) \n\t" // X 

261 
// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 

262 
"movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 

263 
PAVGB((%%REGc), %%mm0) // 11 /2 

264 
PAVGB(%%mm0, %%mm6) //11 11 /4 

265 
PAVGB(%%mm1, %%mm4) // 11 /2 

266 
PAVGB(%%mm2, %%mm1) // 11 /2 

267 
PAVGB(%%mm1, %%mm6) //1122 11 /8 

268 
PAVGB(%%mm5, %%mm6) //112242211 /16 

269 
"movq (%%"REG_a"), %%mm5 \n\t" // 1 

270 
"movq %%mm6, (%%"REG_a") \n\t" // X 

271 
// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 

272 
"movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1 

273 
PAVGB(%%mm7, %%mm6) // 11 /2 

274 
PAVGB(%%mm4, %%mm6) // 11 11 /4 

275 
PAVGB(%%mm3, %%mm6) // 11 2211 /8 

276 
PAVGB(%%mm5, %%mm2) // 11 /2 

277 
"movq (%0, %1, 4), %%mm4 \n\t" // 1 

278 
PAVGB(%%mm4, %%mm2) // 112 /4 

279 
PAVGB(%%mm2, %%mm6) // 112242211 /16 

280 
"movq %%mm6, (%0, %1, 4) \n\t" // X 

281 
// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 

282 
PAVGB(%%mm7, %%mm1) // 11 2 /4 

283 
PAVGB(%%mm4, %%mm5) // 11 /2 

284 
PAVGB(%%mm5, %%mm0) // 11 11 /4 

285 
"movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1 

286 
PAVGB(%%mm6, %%mm1) // 11 4 2 /8 

287 
PAVGB(%%mm0, %%mm1) // 11224222 /16 

288 
"movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X 

289 
// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 

290 
PAVGB((%%REGc), %%mm2) // 112 4 /8 

291 
"movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 

292 
PAVGB(%%mm0, %%mm6) // 1 1 /2 

293 
PAVGB(%%mm7, %%mm6) // 1 12 /4 

294 
PAVGB(%%mm2, %%mm6) // 1122424 /4 

295 
"movq %%mm6, (%%"REG_c") \n\t" // X 

296 
// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 

297 
PAVGB(%%mm7, %%mm5) // 11 2 /4 

298 
PAVGB(%%mm7, %%mm5) // 11 6 /8 

299  
300 
PAVGB(%%mm3, %%mm0) // 112 /4 

301 
PAVGB(%%mm0, %%mm5) // 112246 /16 

302 
"movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X 

303 
"sub %1, %0 \n\t" 

304  
305 
: 

306
: "r" (src), "r" ((long)stride), "m" (c->pQPb)

307 
: "%"REG_a, "%"REG_c 

308 
); 

309  309 
#else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
310 
const int l1= stride; 

311 
const int l2= stride + l1; 

312 
const int l3= stride + l2; 

313 
const int l4= stride + l3; 

314 
const int l5= stride + l4; 

315 
const int l6= stride + l5; 

316 
const int l7= stride + l6; 

317 
const int l8= stride + l7; 

318 
const int l9= stride + l8; 

319 
int x; 

320 
src+= stride*3; 

321 
for(x=0; x<BLOCK_SIZE; x++) 

322 
{ 

323 
const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];

324
const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];

325
326
int sums[10];

327
sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;

328
sums[1] = sums[0] - first + src[l4];

329
sums[2] = sums[1] - first + src[l5];

330
sums[3] = sums[2] - first + src[l6];

331
sums[4] = sums[3] - first + src[l7];

332
sums[5] = sums[4] - src[l1] + src[l8];

333
sums[6] = sums[5] - src[l2] + last;

334
sums[7] = sums[6] - src[l3] + last;

335
sums[8] = sums[7] - src[l4] + last;

336
sums[9] = sums[8] - src[l5] + last;

337  
338 
src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; 

339 
src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; 

340 
src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; 

341 
src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; 

342 
src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; 

343 
src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; 

344 
src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; 

345 
src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; 

346  
347 
src++; 

348 
} 

310 
const int l1= stride; 

311 
const int l2= stride + l1; 

312 
const int l3= stride + l2; 

313 
const int l4= stride + l3; 

314 
const int l5= stride + l4; 

315 
const int l6= stride + l5; 

316 
const int l7= stride + l6; 

317 
const int l8= stride + l7; 

318 
const int l9= stride + l8; 

319 
int x; 

320 
src+= stride*3; 

321 
for(x=0; x<BLOCK_SIZE; x++){ 

322 
const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];

323
const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];

324
325
int sums[10];

326
sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;

327
sums[1] = sums[0] - first + src[l4];

328
sums[2] = sums[1] - first + src[l5];

329
sums[3] = sums[2] - first + src[l6];

330
sums[4] = sums[3] - first + src[l7];

331
sums[5] = sums[4] - src[l1] + src[l8];

332
sums[6] = sums[5] - src[l2] + last;

333
sums[7] = sums[6] - src[l3] + last;

334
sums[8] = sums[7] - src[l4] + last;

335
sums[9] = sums[8] - src[l5] + last;

336  
337 
src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; 

338 
src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; 

339 
src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; 

340 
src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; 

341 
src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; 

342 
src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; 

343 
src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; 

344 
src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; 

345  
346 
src++; 

347 
} 

349  348 
#endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
350  349 
} 
351  350 
#endif //HAVE_ALTIVEC 
...  ...  
366  365 
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) 
367  366 
{ 
368  367 
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
369 
src+= stride*3;


368 
src+= stride*3; 

370  369 
// FIXME rounding 
371 
asm volatile(


372 
"pxor %%mm7, %%mm7 \n\t" // 0


373 
"movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE


374 
"leal (%0, %1), %%"REG_a" \n\t"


375 
"leal (%%"REG_a", %1, 4), %%"REG_c" \n\t"


370 
asm volatile( 

371 
"pxor %%mm7, %%mm7 \n\t" // 0 

372 
"movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE 

373 
"leal (%0, %1), %%"REG_a" \n\t" 

374 
"leal (%%"REG_a", %1, 4), %%"REG_c" \n\t" 

376  375 
// 0 1 2 3 4 5 6 7 8 9 
377  376 
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 
378 
"movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP


379 
"movq %%mm0, %%mm1 \n\t" // QP,..., QP


380 
"paddusb "MANGLE(b02)", %%mm0 \n\t"


381 
"psrlw $2, %%mm0 \n\t"


382 
"pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4


383 
"paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...


384 
"movq (%0, %1, 4), %%mm2 \n\t" // line 4


385 
"movq (%%"REG_c"), %%mm3 \n\t" // line 5


386 
"movq %%mm2, %%mm4 \n\t" // line 4


387
"pcmpeqb %%mm5, %%mm5 \n\t" // -1


388
"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1


389
PAVGB(%%mm3, %%mm5)


390
"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2


391 
"psubusb %%mm3, %%mm4 \n\t"


392 
"psubusb %%mm2, %%mm3 \n\t"


393
"por %%mm3, %%mm4 \n\t" // |l4 - l5|


394 
"psubusb %%mm0, %%mm4 \n\t"


395 
"pcmpeqb %%mm7, %%mm4 \n\t"


396 
"pand %%mm4, %%mm5 \n\t" // d/2


397  
398 
// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80


399 
"paddb %%mm5, %%mm2 \n\t"


400 
// "psubb %%mm6, %%mm2 \n\t"


401 
"movq %%mm2, (%0,%1, 4) \n\t"


402  
403 
"movq (%%"REG_c"), %%mm2 \n\t"


404 
// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80


405 
"psubb %%mm5, %%mm2 \n\t"


406 
// "psubb %%mm6, %%mm2 \n\t"


407 
"movq %%mm2, (%%"REG_c") \n\t"


408  
409 
"paddb %%mm6, %%mm5 \n\t"


410 
"psrlw $2, %%mm5 \n\t"


411 
"pand "MANGLE(b3F)", %%mm5 \n\t"


412
"psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8


413  
414 
"movq (%%"REG_a", %1, 2), %%mm2 \n\t"


415 
"paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80


416 
"paddsb %%mm5, %%mm2 \n\t"


417 
"psubb %%mm6, %%mm2 \n\t"


418 
"movq %%mm2, (%%"REG_a", %1, 2) \n\t"


419  
420 
"movq (%%"REG_c", %1), %%mm2 \n\t"


421 
"paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80


422 
"psubsb %%mm5, %%mm2 \n\t"


423 
"psubb %%mm6, %%mm2 \n\t"


424 
"movq %%mm2, (%%"REG_c", %1) \n\t"


425  
426 
:


427 
: "r" (src), "r" ((long)stride)


428 
: "%"REG_a, "%"REG_c


429 
);


377 
"movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP 

378 
"movq %%mm0, %%mm1 \n\t" // QP,..., QP 

379 
"paddusb "MANGLE(b02)", %%mm0 \n\t" 

380 
"psrlw $2, %%mm0 \n\t" 

381 
"pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 

382 
"paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... 

383 
"movq (%0, %1, 4), %%mm2 \n\t" // line 4 

384 
"movq (%%"REG_c"), %%mm3 \n\t" // line 5 

385 
"movq %%mm2, %%mm4 \n\t" // line 4 

386
"pcmpeqb %%mm5, %%mm5 \n\t" // -1

387
"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1

388
PAVGB(%%mm3, %%mm5)

389
"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2

390 
"psubusb %%mm3, %%mm4 \n\t" 

391 
"psubusb %%mm2, %%mm3 \n\t" 

392
"por %%mm3, %%mm4 \n\t" // |l4 - l5|

393 
"psubusb %%mm0, %%mm4 \n\t" 

394 
"pcmpeqb %%mm7, %%mm4 \n\t" 

395 
"pand %%mm4, %%mm5 \n\t" // d/2 

396  
397 
// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 

398 
"paddb %%mm5, %%mm2 \n\t" 

399 
// "psubb %%mm6, %%mm2 \n\t" 

400 
"movq %%mm2, (%0,%1, 4) \n\t" 

401  
402 
"movq (%%"REG_c"), %%mm2 \n\t" 

403 
// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 

404 
"psubb %%mm5, %%mm2 \n\t" 

405 
// "psubb %%mm6, %%mm2 \n\t" 

406 
"movq %%mm2, (%%"REG_c") \n\t" 

407  
408 
"paddb %%mm6, %%mm5 \n\t" 

409 
"psrlw $2, %%mm5 \n\t" 

410 
"pand "MANGLE(b3F)", %%mm5 \n\t" 

411
"psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8

412  
413 
"movq (%%"REG_a", %1, 2), %%mm2 \n\t" 

414 
"paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 

415 
"paddsb %%mm5, %%mm2 \n\t" 

416 
"psubb %%mm6, %%mm2 \n\t" 

417 
"movq %%mm2, (%%"REG_a", %1, 2) \n\t" 

418  
419 
"movq (%%"REG_c", %1), %%mm2 \n\t" 

420 
"paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 

421 
"psubsb %%mm5, %%mm2 \n\t" 

422 
"psubb %%mm6, %%mm2 \n\t" 

423 
"movq %%mm2, (%%"REG_c", %1) \n\t" 

424  
425 
: 

426 
: "r" (src), "r" ((long)stride) 

427 
: "%"REG_a, "%"REG_c 

428 
); 

430  429 
#else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
431 
const int l1= stride; 

432 
const int l2= stride + l1; 

433 
const int l3= stride + l2; 

434 
const int l4= stride + l3; 

435 
const int l5= stride + l4; 

436 
const int l6= stride + l5; 

437 
// const int l7= stride + l6; 

438 
// const int l8= stride + l7; 

439 
// const int l9= stride + l8; 

440 
int x; 

441 
const int QP15= QP + (QP>>2); 

442 
src+= stride*3; 

443 
for(x=0; x<BLOCK_SIZE; x++) 

444 
{ 

445 
const int v = (src[x+l5] - src[x+l4]);

446
if(FFABS(v) < QP15)

447
{

448
src[x+l3] +=v>>3;

449
src[x+l4] +=v>>1;

450
src[x+l5] -=v>>1;

451
src[x+l6] -=v>>3;

452  
453 
} 

430 
const int l1= stride; 

431 
const int l2= stride + l1; 

432 
const int l3= stride + l2; 

433 
const int l4= stride + l3; 

434 
const int l5= stride + l4; 

435 
const int l6= stride + l5; 

436 
// const int l7= stride + l6; 

437 
// const int l8= stride + l7; 

438 
// const int l9= stride + l8; 

439 
int x; 

440 
const int QP15= QP + (QP>>2); 

441 
src+= stride*3; 

442 
for(x=0; x<BLOCK_SIZE; x++){ 

443 
const int v = (src[x+l5] - src[x+l4]);

444
if(FFABS(v) < QP15){

445
src[x+l3] +=v>>3;

446
src[x+l4] +=v>>1;

447
src[x+l5] -=v>>1;

448
src[x+l6] -=v>>3;

454  449 
} 
450 
} 

455  451  
456  452 
#endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
457  453 
} 
...  ...  
467  463 
static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) 
468  464 
{ 
469  465 
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
470 
src+= stride*3;


466 
src+= stride*3; 

471  467  
472 
asm volatile(


473 
"pxor %%mm7, %%mm7 \n\t" // 0


474 
"lea (%0, %1), %%"REG_a" \n\t"


475 
"lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"


468 
asm volatile( 

469 
"pxor %%mm7, %%mm7 \n\t" // 0 

470 
"lea (%0, %1), %%"REG_a" \n\t" 

471 
"lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 

476  472 
// 0 1 2 3 4 5 6 7 8 9 
477  473 
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 
478 
"movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3


479 
"movq (%0, %1, 4), %%mm1 \n\t" // line 4


480 
"movq %%mm1, %%mm2 \n\t" // line 4


481 
"psubusb %%mm0, %%mm1 \n\t"


482 
"psubusb %%mm2, %%mm0 \n\t"


483
"por %%mm1, %%mm0 \n\t" // |l2 - l3|


484 
"movq (%%"REG_c"), %%mm3 \n\t" // line 5


485 
"movq (%%"REG_c", %1), %%mm4 \n\t" // line 6


486 
"movq %%mm3, %%mm5 \n\t" // line 5


487 
"psubusb %%mm4, %%mm3 \n\t"


488 
"psubusb %%mm5, %%mm4 \n\t"


489
"por %%mm4, %%mm3 \n\t" // |l5 - l6|


490
PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2


491 
"movq %%mm2, %%mm1 \n\t" // line 4


492 
"psubusb %%mm5, %%mm2 \n\t"


493 
"movq %%mm2, %%mm4 \n\t"


494
"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0


495 
"psubusb %%mm1, %%mm5 \n\t"


496
"por %%mm5, %%mm4 \n\t" // |l4 - l5|


497
"psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)


498 
"movq %%mm4, %%mm3 \n\t" // d


499 
"movq %2, %%mm0 \n\t"


500 
"paddusb %%mm0, %%mm0 \n\t"


501 
"psubusb %%mm0, %%mm4 \n\t"


502
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0


503 
"psubusb "MANGLE(b01)", %%mm3 \n\t"


504 
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0


505  
506 
PAVGB(%%mm7, %%mm3) // d/2


507 
"movq %%mm3, %%mm1 \n\t" // d/2


508 
PAVGB(%%mm7, %%mm3) // d/4


509 
PAVGB(%%mm1, %%mm3) // 3*d/8


510  
511 
"movq (%0, %1, 4), %%mm0 \n\t" // line 4


512 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l41 : l4


513 
"psubusb %%mm3, %%mm0 \n\t"


514 
"pxor %%mm2, %%mm0 \n\t"


515 
"movq %%mm0, (%0, %1, 4) \n\t" // line 4


516  
517 
"movq (%%"REG_c"), %%mm0 \n\t" // line 5


518 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l51 : l5


519 
"paddusb %%mm3, %%mm0 \n\t"


520 
"pxor %%mm2, %%mm0 \n\t"


521 
"movq %%mm0, (%%"REG_c") \n\t" // line 5


522  
523 
PAVGB(%%mm7, %%mm1) // d/4


524  
525 
"movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3


526 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l41 : l4


527 
"psubusb %%mm1, %%mm0 \n\t"


528 
"pxor %%mm2, %%mm0 \n\t"


529 
"movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3


530  
531 
"movq (%%"REG_c", %1), %%mm0 \n\t" // line 6


532 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l51 : l5


533 
"paddusb %%mm1, %%mm0 \n\t"


534 
"pxor %%mm2, %%mm0 \n\t"


535 
"movq %%mm0, (%%"REG_c", %1) \n\t" // line 6


536  
537 
PAVGB(%%mm7, %%mm1) // d/8


538  
539 
"movq (%%"REG_a", %1), %%mm0 \n\t" // line 2


540 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l21 : l2


541 
"psubusb %%mm1, %%mm0 \n\t"


542 
"pxor %%mm2, %%mm0 \n\t"


543 
"movq %%mm0, (%%"REG_a", %1) \n\t" // line 2


544  
545 
"movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7


546 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l71 : l7


547 
"paddusb %%mm1, %%mm0 \n\t"


548 
"pxor %%mm2, %%mm0 \n\t"


549 
"movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7


550  
551 
:


552
: "r" (src), "r" ((long)stride), "m" (co->pQPb)


553 
: "%"REG_a, "%"REG_c


554 
);


474 
"movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 

475 
"movq (%0, %1, 4), %%mm1 \n\t" // line 4 

476 
"movq %%mm1, %%mm2 \n\t" // line 4 

477 
"psubusb %%mm0, %%mm1 \n\t" 

478 
"psubusb %%mm2, %%mm0 \n\t" 

479
"por %%mm1, %%mm0 \n\t" // |l2 - l3|

480 
"movq (%%"REG_c"), %%mm3 \n\t" // line 5 

481 
"movq (%%"REG_c", %1), %%mm4 \n\t" // line 6 

482 
"movq %%mm3, %%mm5 \n\t" // line 5 

483 
"psubusb %%mm4, %%mm3 \n\t" 

484 
"psubusb %%mm5, %%mm4 \n\t" 

485
"por %%mm4, %%mm3 \n\t" // |l5 - l6|

486
PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2

487 
"movq %%mm2, %%mm1 \n\t" // line 4 

488 
"psubusb %%mm5, %%mm2 \n\t" 

489 
"movq %%mm2, %%mm4 \n\t" 

490
"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0

491 
"psubusb %%mm1, %%mm5 \n\t" 

492
"por %%mm5, %%mm4 \n\t" // |l4 - l5|

493
"psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)

494 
"movq %%mm4, %%mm3 \n\t" // d 

495 
"movq %2, %%mm0 \n\t" 

496 
"paddusb %%mm0, %%mm0 \n\t" 

497 
"psubusb %%mm0, %%mm4 \n\t" 

498
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0

499 
"psubusb "MANGLE(b01)", %%mm3 \n\t" 

500 
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 

501  
502 
PAVGB(%%mm7, %%mm3) // d/2 

503 
"movq %%mm3, %%mm1 \n\t" // d/2 

504 
PAVGB(%%mm7, %%mm3) // d/4 

505 
PAVGB(%%mm1, %%mm3) // 3*d/8 

506  
507 
"movq (%0, %1, 4), %%mm0 \n\t" // line 4 

508 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l41 : l4 

509 
"psubusb %%mm3, %%mm0 \n\t" 

510 
"pxor %%mm2, %%mm0 \n\t" 

511 
"movq %%mm0, (%0, %1, 4) \n\t" // line 4 

512  
513 
"movq (%%"REG_c"), %%mm0 \n\t" // line 5 

514 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l51 : l5 

515 
"paddusb %%mm3, %%mm0 \n\t" 

516 
"pxor %%mm2, %%mm0 \n\t" 

517 
"movq %%mm0, (%%"REG_c") \n\t" // line 5 

518  
519 
PAVGB(%%mm7, %%mm1) // d/4 

520  
521 
"movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 

522 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l41 : l4 

523 
"psubusb %%mm1, %%mm0 \n\t" 

524 
"pxor %%mm2, %%mm0 \n\t" 

525 
"movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3 

526  
527 
"movq (%%"REG_c", %1), %%mm0 \n\t" // line 6 

528 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l51 : l5 

529 
"paddusb %%mm1, %%mm0 \n\t" 

530 
"pxor %%mm2, %%mm0 \n\t" 

531 
"movq %%mm0, (%%"REG_c", %1) \n\t" // line 6 

532  
533 
PAVGB(%%mm7, %%mm1) // d/8 

534  
535 
"movq (%%"REG_a", %1), %%mm0 \n\t" // line 2 

536 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l21 : l2 

537 
"psubusb %%mm1, %%mm0 \n\t" 

538 
"pxor %%mm2, %%mm0 \n\t" 

539 
"movq %%mm0, (%%"REG_a", %1) \n\t" // line 2 

540  
541 
"movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7 

542 
"pxor %%mm2, %%mm0 \n\t" //(l4  l5) <= 0 ? l71 : l7 

543 
"paddusb %%mm1, %%mm0 \n\t" 

544 
"pxor %%mm2, %%mm0 \n\t" 

545 
"movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7 

546  
547 
: 

548
: "r" (src), "r" ((long)stride), "m" (co->pQPb)

549 
: "%"REG_a, "%"REG_c 

550 
); 

555  551 
#else //defined (HAVE_MMX2)  defined (HAVE_3DNOW) 
556  552  
557 
const int l1= stride; 

558 
const int l2= stride + l1; 

559 
const int l3= stride + l2; 

560 
const int l4= stride + l3; 

561 
const int l5= stride + l4; 

562 
const int l6= stride + l5; 

563 
const int l7= stride + l6; 

564 
// const int l8= stride + l7; 

565 
// const int l9= stride + l8; 

566 
int x; 

567  
568 
src+= stride*3; 

569 
for(x=0; x<BLOCK_SIZE; x++) 

570 
{ 

571 
int a= src[l3]  src[l4]; 

572 
int b= src[l4]  src[l5]; 

573 
int c= src[l5]  src[l6]; 

574  
575 
int d= FFABS(b)  ((FFABS(a) + FFABS(c))>>1); 

576 
d= FFMAX(d, 0); 

577  
578 
if(d < co>QP*2) 

579 
{ 

580 
int v = d * FFSIGN(b); 

581  
582 
src[l2] +=v>>3; 

583 
src[l3] +=v>>2; 

584 
src[l4] +=(3*v)>>3; 

585 
src[l5] =(3*v)>>3; 

586 
src[l6] =v>>2; 

587 
src[l7] =v>>3; 

588  
589 
} 

590 
src++; 

553 
const int l1= stride; 

554 
const int l2= stride + l1; 

555 
const int l3= stride + l2; 

556 
const int l4= stride + l3; 

557 
const int l5= stride + l4; 

558 
const int l6= stride + l5; 

559 
const int l7= stride + l6; 

560 
// const int l8= stride + l7; 

561 
// const int l9= stride + l8; 

562 
int x; 

563  
564 
src+= stride*3; 

565 
for(x=0; x<BLOCK_SIZE; x++){ 

566 
int a= src[l3]  src[l4]; 

567 
int b= src[l4]  src[l5]; 

568 
int c= src[l5]  src[l6]; 

569  
570 
int d= FFABS(b)  ((FFABS(a) + FFABS(c))>>1); 

571 
d= FFMAX(d, 0); 

572  
573 
if(d < co>QP*2){ 

574 
int v = d * FFSIGN(b); 

575  
576 
src[l2] +=v>>3; 

577 
src[l3] +=v>>2; 

578 
src[l4] +=(3*v)>>3; 

579 
src[l5] =(3*v)>>3; 

580 
src[l6] =v>>2; 

581 
src[l7] =v>>3; 

591  582 
} 
583 
src++; 

584 
} 

592  585 
#endif //defined (HAVE_MMX2)  defined (HAVE_3DNOW) 
593  586 
} 
594  587  
...  ...  
597  590 
{ 
598  591 
#if defined (HAVE_MMX2)  defined (HAVE_3DNOW) 
599  592 
/* 
600 
uint8_t tmp[16];


601 
const int l1= stride;


602 
const int l2= stride + l1;


603 
const int l3= stride + l2;


604 
const int l4= (int)tmp  (int)src  stride*3;


605 
const int l5= (int)tmp  (int)src  stride*3 + 8;


606 
const int l6= stride*3 + l3;


607 
const int l7= stride + l6;


608 
const int l8= stride + l7;


609  
610 
memcpy(tmp, src+stride*7, 8);


611 
memcpy(tmp+8, src+stride*8, 8);


593 
uint8_t tmp[16]; 

594 
const int l1= stride; 

595 
const int l2= stride + l1; 

596 
const int l3= stride + l2; 

597 
const int l4= (int)tmp  (int)src  stride*3; 

598 
const int l5= (int)tmp  (int)src  stride*3 + 8; 

599 
const int l6= stride*3 + l3; 

600 
const int l7= stride + l6; 

601 
const int l8= stride + l7; 

602  
603 
memcpy(tmp, src+stride*7, 8); 

604 
memcpy(tmp+8, src+stride*8, 8); 

612  605 
*/ 
613 
src+= stride*4;


614 
asm volatile(


606 
src+= stride*4; 

607 
asm volatile( 

615  608  
616  609 
#if 0 //sligtly more accurate and slightly slower 
617 
"pxor %%mm7, %%mm7 \n\t" // 0


618 
"lea (%0, %1), %%"REG_a" \n\t"


619 
"lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"


610 
"pxor %%mm7, %%mm7 \n\t" // 0 

611 
"lea (%0, %1), %%"REG_a" \n\t" 

612 
"lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 

620  613 
// 0 1 2 3 4 5 6 7 
621  614 
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 
622  615 
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 
623  616  
624  617  
625 
"movq (%0, %1, 2), %%mm0 \n\t" // l2


626 
"movq (%0), %%mm1 \n\t" // l0


627 
"movq %%mm0, %%mm2 \n\t" // l2


628 
PAVGB(%%mm7, %%mm0) // ~l2/2


629 
PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4


630 
PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8


631  
632 
"movq (%%"REG_a"), %%mm1 \n\t" // l1


633 
"movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3


634 
"movq %%mm1, %%mm4 \n\t" // l1


635 
PAVGB(%%mm7, %%mm1) // ~l1/2


636 
PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4


637 
PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8


638  
639 
"movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8


640 
"psubusb %%mm1, %%mm0 \n\t"


641 
"psubusb %%mm4, %%mm1 \n\t"


642 
"por %%mm0, %%mm1 \n\t" // ~2l0  5l1 + 5l2  2l3/8


618 
"movq (%0, %1, 2), %%mm0 \n\t" // l2 

619 
"movq (%0), %%mm1 \n\t" // l0 

620 
"movq %%mm0, %%mm2 \n\t" // l2 

621 
PAVGB(%%mm7, %%mm0) // ~l2/2 

622 
PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 

623 
PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 

624  
625 
"movq (%%"REG_a"), %%mm1 \n\t" // l1 

626 
"movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3 

627 
"movq %%mm1, %%mm4 \n\t" // l1 

628 
PAVGB(%%mm7, %%mm1) // ~l1/2 

629 
PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 

630 
PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 

631  
632 
"movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 

633 
"psubusb %%mm1, %%mm0 \n\t" 

634 
"psubusb %%mm4, %%mm1 \n\t" 

635 
"por %%mm0, %%mm1 \n\t" // ~2l0  5l1 + 5l2  2l3/8 

643  636 
// mm1= lenergy, mm2= l2, mm3= l3, mm7=0 
644  637  
645 
"movq (%0, %1, 4), %%mm0 \n\t" // l4


646 
"movq %%mm0, %%mm4 \n\t" // l4


647 
PAVGB(%%mm7, %%mm0) // ~l4/2


648 
PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4


649 
PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8


650  
651 
"movq (%%"REG_c"), %%mm2 \n\t" // l5


652 
"movq %%mm3, %%mm5 \n\t" // l3


653 
PAVGB(%%mm7, %%mm3) // ~l3/2


654 
PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4


655 
PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8


656  
657 
"movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8


658 
"psubusb %%mm3, %%mm0 \n\t"


659 
"psubusb %%mm6, %%mm3 \n\t"


660 
"por %%mm0, %%mm3 \n\t" // ~2l2  5l3 + 5l4  2l5/8


661 
"pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2  5l3 + 5l4  2l5)


638 
"movq (%0, %1, 4), %%mm0 \n\t" // l4 

639 
"movq %%mm0, %%mm4 \n\t" // l4 

640 
PAVGB(%%mm7, %%mm0) // ~l4/2 

641 
PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 

642 
PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 

643  
644 
"movq (%%"REG_c"), %%mm2 \n\t" // l5 

645 
"movq %%mm3, %%mm5 \n\t" // l3 

646 
PAVGB(%%mm7, %%mm3) // ~l3/2 

647 
PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 

648 
PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 

649  
650 
"movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 

651 
"psubusb %%mm3, %%mm0 \n\t" 

652 
"psubusb %%mm6, %%mm3 \n\t" 

653 
"por %%mm0, %%mm3 \n\t" // ~2l2  5l3 + 5l4  2l5/8 

654 
"pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2  5l3 + 5l4  2l5) 

662  655 
// mm0= SIGN(menergy), mm1= lenergy, mm2= l5, mm3= menergy, mm4=l4, mm5= l3, mm7=0 
663  656  
664 
"movq (%%"REG_c", %1), %%mm6 \n\t" // l6


665 
"movq %%mm6, %%mm5 \n\t" // l6


666 
PAVGB(%%mm7, %%mm6) // ~l6/2


667 
PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4


668 
PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8


669  
670 
"movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7


671 
"movq %%mm2, %%mm4 \n\t" // l5


672 
PAVGB(%%mm7, %%mm2) // ~l5/2


673 
PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4


674 
PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8


675  
676 
"movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8


677 
"psubusb %%mm2, %%mm6 \n\t"


678 
"psubusb %%mm4, %%mm2 \n\t"


679 
"por %%mm6, %%mm2 \n\t" // ~2l4  5l5 + 5l6  2l7/8


657 
"movq (%%"REG_c", %1), %%mm6 \n\t" // l6 

658 
"movq %%mm6, %%mm5 \n\t" // l6 

659 
PAVGB(%%mm7, %%mm6) // ~l6/2 

660 
PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 

661 
PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 

662  
663 
"movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7 

664 
"movq %%mm2, %%mm4 \n\t" // l5 

665 
PAVGB(%%mm7, %%mm2) // ~l5/2 

666 
PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 

667 
PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 

668  
669 
"movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 

670 
"psubusb %%mm2, %%mm6 \n\t" 

671 
"psubusb %%mm4, %%mm2 \n\t" 

672 
"por %%mm6, %%mm2 \n\t" // ~2l4  5l5 + 5l6  2l7/8 

680  673 
// mm0= SIGN(menergy), mm1= lenergy/8, mm2= renergy/8, mm3= menergy/8, mm7=0 
681  674  
682  675  
683 
PMINUB(%%mm2, %%mm1, %%mm4) // MIN(lenergy,renergy)/8


684 
"movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?


685 
"paddusb "MANGLE(b01)", %%mm4 \n\t"


686 
"pcmpgtb %%mm3, %%mm4 \n\t" // menergy/8 < QP


687 
"psubusb %%mm1, %%mm3 \n\t" // d=menergy/8MIN(lenergy,renergy)/8


688 
"pand %%mm4, %%mm3 \n\t"


689  
690 
"movq %%mm3, %%mm1 \n\t"


691 
// "psubusb "MANGLE(b01)", %%mm3 \n\t"


692 
PAVGB(%%mm7, %%mm3)


693 
PAVGB(%%mm7, %%mm3)


694 
"paddusb %%mm1, %%mm3 \n\t"


695 
// "paddusb "MANGLE(b01)", %%mm3 \n\t"


696  
697 
"movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3


698 
"movq (%0, %1, 4), %%mm5 \n\t" //l4


699 
"movq (%0, %1, 4), %%mm4 \n\t" //l4


700 
"psubusb %%mm6, %%mm5 \n\t"


701 
"psubusb %%mm4, %%mm6 \n\t"


702 
"por %%mm6, %%mm5 \n\t" // l3l4


703 
"pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3l4)


704 
"pxor %%mm6, %%mm0 \n\t"


705 
"pand %%mm0, %%mm3 \n\t"


706 
PMINUB(%%mm5, %%mm3, %%mm0)


707  
708 
"psubusb "MANGLE(b01)", %%mm3 \n\t"


709 
PAVGB(%%mm7, %%mm3)


710  
711 
"movq (%%"REG_a", %1, 2), %%mm0 \n\t"


712 
"movq (%0, %1, 4), %%mm2 \n\t"


713 
"pxor %%mm6, %%mm0 \n\t"


714 
"pxor %%mm6, %%mm2 \n\t"


715 
"psubb %%mm3, %%mm0 \n\t"


716 
"paddb %%mm3, %%mm2 \n\t"


717 
"pxor %%mm6, %%mm0 \n\t"


718 
"pxor %%mm6, %%mm2 \n\t"


719 
"movq %%mm0, (%%"REG_a", %1, 2) \n\t"


720 
"movq %%mm2, (%0, %1, 4) \n\t"


676 
PMINUB(%%mm2, %%mm1, %%mm4) // MIN(lenergy,renergy)/8 

677 
"movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? 

678 
"paddusb "MANGLE(b01)", %%mm4 \n\t" 

679 
"pcmpgtb %%mm3, %%mm4 \n\t" // menergy/8 < QP 

680 
"psubusb %%mm1, %%mm3 \n\t" // d=menergy/8MIN(lenergy,renergy)/8 

681 
"pand %%mm4, %%mm3 \n\t" 

682  
683 
"movq %%mm3, %%mm1 \n\t" 

684 
// "psubusb "MANGLE(b01)", %%mm3 \n\t" 

685 
PAVGB(%%mm7, %%mm3) 

686 
PAVGB(%%mm7, %%mm3) 

687 
"paddusb %%mm1, %%mm3 \n\t" 

688 
// "paddusb "MANGLE(b01)", %%mm3 \n\t" 

689  
690 
"movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3 

691 
"movq (%0, %1, 4), %%mm5 \n\t" //l4 

692 
"movq (%0, %1, 4), %%mm4 \n\t" //l4 

693 
"psubusb %%mm6, %%mm5 \n\t" 

694 
"psubusb %%mm4, %%mm6 \n\t" 

695 
"por %%mm6, %%mm5 \n\t" // l3l4 

696 
"pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3l4) 

697 
"pxor %%mm6, %%mm0 \n\t" 

698 
"pand %%mm0, %%mm3 \n\t" 

699 
PMINUB(%%mm5, %%mm3, %%mm0) 

700  
701 
"psubusb "MANGLE(b01)", %%mm3 \n\t" 

702 
PAVGB(%%mm7, %%mm3) 

703  
704 
"movq (%%"REG_a", %1, 2), %%mm0 \n\t" 

705 
"movq (%0, %1, 4), %%mm2 \n\t" 

706 
"pxor %%mm6, %%mm0 \n\t" 

707 
"pxor %%mm6, %%mm2 \n\t" 

708 
"psubb %%mm3, %%mm0 \n\t" 

709 
"paddb %%mm3, %%mm2 \n\t" 

710 
"pxor %%mm6, %%mm0 \n\t" 

711 
"pxor %%mm6, %%mm2 \n\t" 

712 
"movq %%mm0, (%%"REG_a", %1, 2) \n\t" 

713 
"movq %%mm2, (%0, %1, 4) \n\t" 

721  714 
#endif //0 
722  715  
723 
"lea (%0, %1), %%"REG_a" \n\t"


724 
"pcmpeqb %%mm6, %%mm6 \n\t" // 1


716 
"lea (%0, %1), %%"REG_a" \n\t" 

717 
"pcmpeqb %%mm6, %%mm6 \n\t" // 1 

725  718 
// 0 1 2 3 4 5 6 7 
726  719 
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 
727  720 
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 
728  721  
729  722  
730 
"movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3


731 
"movq (%0, %1, 4), %%mm0 \n\t" // l4


732 
"pxor %%mm6, %%mm1 \n\t" // l31


733 
PAVGB(%%mm1, %%mm0) // q+128 = (l4l3+256)/2


723 
"movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3 

724 
"movq (%0, %1, 4), %%mm0 \n\t" // l4 

725 
"pxor %%mm6, %%mm1 \n\t" // l31 

726 
PAVGB(%%mm1, %%mm0) // q+128 = (l4l3+256)/2 

734  727 
// mm1=l31, mm0=128q 
735  728  
736 
"movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5


737 
"movq (%%"REG_a", %1), %%mm3 \n\t" // l2


738 
"pxor %%mm6, %%mm2 \n\t" // l51


739 
"movq %%mm2, %%mm5 \n\t" // l51


740 
"movq "MANGLE(b80)", %%mm4 \n\t" // 128


741 
"lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"


742 
PAVGB(%%mm3, %%mm2) // (l2l5+256)/2


743 
PAVGB(%%mm0, %%mm4) // ~(l4l3)/4 + 128


744 
PAVGB(%%mm2, %%mm4) // ~(l2l5)/4 +(l4l3)/8 + 128


745 
PAVGB(%%mm0, %%mm4) // ~(l2l5)/8 +5(l4l3)/16 + 128


729 
"movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5 

730 
"movq (%%"REG_a", %1), %%mm3 \n\t" // l2 

731 
"pxor %%mm6, %%mm2 \n\t" // l51 

732 
"movq %%mm2, %%mm5 \n\t" // l51 

733 
"movq "MANGLE(b80)", %%mm4 \n\t" // 128 

734 
"lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 

735 
PAVGB(%%mm3, %%mm2) // (l2l5+256)/2 

736 
PAVGB(%%mm0, %%mm4) // ~(l4l3)/4 + 128 

737 
PAVGB(%%mm2, %%mm4) // ~(l2l5)/4 +(l4l3)/8 + 128 

738 
PAVGB(%%mm0, %%mm4) // ~(l2l5)/8 +5(l4l3)/16 + 128 

746  739 
// mm1=l31, mm0=128q, mm3=l2, mm4=menergy/16 + 128, mm5= l51 
747  740  
748 
"movq (%%"REG_a"), %%mm2 \n\t" // l1


749 
"pxor %%mm6, %%mm2 \n\t" // l11


750 
PAVGB(%%mm3, %%mm2) // (l2l1+256)/2


751 
PAVGB((%0), %%mm1) // (l0l3+256)/2


752 
"movq "MANGLE(b80)", %%mm3 \n\t" // 128


753 
PAVGB(%%mm2, %%mm3) // ~(l2l1)/4 + 128


754 
PAVGB(%%mm1, %%mm3) // ~(l0l3)/4 +(l2l1)/8 + 128


755 
PAVGB(%%mm2, %%mm3) // ~(l0l3)/8 +5(l2l1)/16 + 128


741 
"movq (%%"REG_a"), %%mm2 \n\t" // l1 

742 
"pxor %%mm6, %%mm2 \n\t" // l11 

743 
PAVGB(%%mm3, %%mm2) // (l2l1+256)/2 

744 
PAVGB((%0), %%mm1) // (l0l3+256)/2 

745 
"movq "MANGLE(b80)", %%mm3 \n\t" // 128 

746 
PAVGB(%%mm2, %%mm3) // ~(l2l1)/4 + 128 

747 
PAVGB(%%mm1, %%mm3) // ~(l0l3)/4 +(l2l1)/8 + 128 

748 
PAVGB(%%mm2, %%mm3) // ~(l0l3)/8 +5(l2l1)/16 + 128 

756  749 
// mm0=128q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= l51 
757  750  
758 
PAVGB((%%REGc, %1), %%mm5) // (l6l5+256)/2


759 
"movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7


760 
"pxor %%mm6, %%mm1 \n\t" // l71


761 
PAVGB((%0, %1, 4), %%mm1) // (l4l7+256)/2


762 
"movq "MANGLE(b80)", %%mm2 \n\t" // 128


763 
PAVGB(%%mm5, %%mm2) // ~(l6l5)/4 + 128


764 
PAVGB(%%mm1, %%mm2) // ~(l4l7)/4 +(l6l5)/8 + 128


765 
PAVGB(%%mm5, %%mm2) // ~(l4l7)/8 +5(l6l5)/16 + 128


751 
PAVGB((%%REGc, %1), %%mm5) // (l6l5+256)/2 

752 
"movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7 

753 
"pxor %%mm6, %%mm1 \n\t" // l71 

754 
PAVGB((%0, %1, 4), %%mm1) // (l4l7+256)/2 

755 
"movq "MANGLE(b80)", %%mm2 \n\t" // 128 

756 
PAVGB(%%mm5, %%mm2) // ~(l6l5)/4 + 128 

757 
PAVGB(%%mm1, %%mm2) // ~(l4l7)/4 +(l6l5)/8 + 128 

758 
PAVGB(%%mm5, %%mm2) // ~(l4l7)/8 +5(l6l5)/16 + 128 

766  759 
// mm0=128q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 
767  760  
768 
"movq "MANGLE(b00)", %%mm1 \n\t" // 0


769 
"movq "MANGLE(b00)", %%mm5 \n\t" // 0


770 
"psubb %%mm2, %%mm1 \n\t" // 128  renergy/16


771 
"psubb %%mm3, %%mm5 \n\t" // 128  lenergy/16


772 
PMAXUB(%%mm1, %%mm2) // 128 + renergy/16


773 
PMAXUB(%%mm5, %%mm3) // 128 + lenergy/16


774 
PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(lenergy,renergy)/16


761 
"movq "MANGLE(b00)", %%mm1 \n\t" // 0 

762 
"movq "MANGLE(b00)", %%mm5 \n\t" // 0 

763 
"psubb %%mm2, %%mm1 \n\t" // 128  renergy/16 

764 
"psubb %%mm3, %%mm5 \n\t" // 128  lenergy/16 

765 
PMAXUB(%%mm1, %%mm2) // 128 + renergy/16 

766 
PMAXUB(%%mm5, %%mm3) // 128 + lenergy/16


767 
PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(lenergy,renergy)/16 

775  768  
776  769 
// mm0=128q, mm3=128 + MIN(lenergy,renergy)/16, mm4= menergy/16 + 128 
777  770  
778 
"movq "MANGLE(b00)", %%mm7 \n\t" // 0


779 
"movq %2, %%mm2 \n\t" // QP


780 
PAVGB(%%mm6, %%mm2) // 128 + QP/2


781 
"psubb %%mm6, %%mm2 \n\t"


782  
783 
"movq %%mm4, %%mm1 \n\t"


784 
"pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)


785 
"pxor %%mm1, %%mm4 \n\t"


786 
"psubb %%mm1, %%mm4 \n\t" // 128 + menergy/16


787 
"pcmpgtb %%mm4, %%mm2 \n\t" // menergy/16 < QP/2


788 
"psubusb %%mm3, %%mm4 \n\t" //d=menergy/16  MIN(lenergy,renergy)/16


771 
"movq "MANGLE(b00)", %%mm7 \n\t" // 0 

772 
"movq %2, %%mm2 \n\t" // QP 

773 
PAVGB(%%mm6, %%mm2) // 128 + QP/2 

774 
"psubb %%mm6, %%mm2 \n\t" 

775  
776 
"movq %%mm4, %%mm1 \n\t" 

777 
"pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) 

778 
"pxor %%mm1, %%mm4 \n\t" 

779 
"psubb %%mm1, %%mm4 \n\t" // 128 + menergy/16 

780 
"pcmpgtb %%mm4, %%mm2 \n\t" // menergy/16 < QP/2 

781 
"psubusb %%mm3, %%mm4 \n\t" //d=menergy/16  MIN(lenergy,renergy)/16 

789  782 
// mm0=128q, mm1= SIGN(menergy), mm2= menergy/16 < QP/2, mm4= d/16 
790  783  
791 
"movq %%mm4, %%mm3 \n\t" // d


792 
"psubusb "MANGLE(b01)", %%mm4 \n\t"


793 
PAVGB(%%mm7, %%mm4) // d/32


794 
PAVGB(%%mm7, %%mm4) // (d + 32)/64


795 
"paddb %%mm3, %%mm4 \n\t" // 5d/64


796 
"pand %%mm2, %%mm4 \n\t"


797  
798 
"movq "MANGLE(b80)", %%mm5 \n\t" // 128


799 
"psubb %%mm0, %%mm5 \n\t" // q


800 
"paddsb %%mm6, %%mm5 \n\t" // fix bad rounding


801 
"pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)


802 
"pxor %%mm7, %%mm5 \n\t"


803  
804 
PMINUB(%%mm5, %%mm4, %%mm3) // MIN(q, 5d/64)


805 
"pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)


806  
807 
"pand %%mm7, %%mm4 \n\t"


808 
"movq (%%"REG_a", %1, 2), %%mm0 \n\t"


809 
"movq (%0, %1, 4), %%mm2 \n\t"


810 
"pxor %%mm1, %%mm0 \n\t"


811 
"pxor %%mm1, %%mm2 \n\t"


812 
"paddb %%mm4, %%mm0 \n\t"


813 
"psubb %%mm4, %%mm2 \n\t"


814 
"pxor %%mm1, %%mm0 \n\t"


815 
"pxor %%mm1, %%mm2 \n\t"


816 
"movq %%mm0, (%%"REG_a", %1, 2) \n\t"


817 
"movq %%mm2, (%0, %1, 4) \n\t"


818  
819 
:


820 
: "r" (src), "r" ((long)stride), "m" (c>pQPb)


821 
: "%"REG_a, "%"REG_c


822 
);


784 
"movq %%mm4, %%mm3 \n\t" // d 

785 
"psubusb "MANGLE(b01)", %%mm4 \n\t" 

786 
PAVGB(%%mm7, %%mm4) // d/32 

787 
PAVGB(%%mm7, %%mm4) // (d + 32)/64 

788 
"paddb %%mm3, %%mm4 \n\t" // 5d/64 

789 
"pand %%mm2, %%mm4 \n\t" 

790  
791 
"movq "MANGLE(b80)", %%mm5 \n\t" // 128 

792 
"psubb %%mm0, %%mm5 \n\t" // q 

793 
"paddsb %%mm6, %%mm5 \n\t" // fix bad rounding 

794 
"pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) 

795 
"pxor %%mm7, %%mm5 \n\t" 

796  
797 
PMINUB(%%mm5, %%mm4, %%mm3) // MIN(q, 5d/64) 

798 
"pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) 

799  
800 
"pand %%mm7, %%mm4 \n\t" 

801 
"movq (%%"REG_a", %1, 2), %%mm0 \n\t" 

802 
"movq (%0, %1, 4), %%mm2 \n\t" 

803 
"pxor %%mm1, %%mm0 \n\t" 

804 
"pxor %%mm1, %%mm2 \n\t" 

805 
"paddb %%mm4, %%mm0 \n\t" 

806 
"psubb %%mm4, %%mm2 \n\t" 

807 
"pxor %%mm1, %%mm0 \n\t" 

808 
"pxor %%mm1, %%mm2 \n\t" 

809 
"movq %%mm0, (%%"REG_a", %1, 2) \n\t" 

810 
"movq %%mm2, (%0, %1, 4) \n\t" 

811  
812 
: 

813 
: "r" (src), "r" ((long)stride), "m" (c>pQPb) 

814 
: "%"REG_a, "%"REG_c 

815 
); 

823  816  
824  817 
/* 
825 
{ 

826 
int x; 

827 
src= stride; 

828 
for(x=0; x<BLOCK_SIZE; x++) 

829 
{ 

830 
const int middleEnergy= 5*(src[l5]  src[l4]) + 2*(src[l3]  src[l6]); 

831 
if(FFABS(middleEnergy)< 8*QP) 

832 
{ 

833 
const int q=(src[l4]  src[l5])/2; 

834 
const int leftEnergy= 5*(src[l3]  src[l2]) + 2*(src[l1]  src[l4]); 

835 
const int rightEnergy= 5*(src[l7]  src[l6]) + 2*(src[l5]  src[l8]); 

836  
837 
int d= FFABS(middleEnergy)  FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 

838 
d= FFMAX(d, 0); 

839  
840 
d= (5*d + 32) >> 6; 

841 
d*= FFSIGN(middleEnergy); 

842  
843 
if(q>0) 

844 
{ 

845 
d= d<0 ? 0 : d; 

846 
d= d>q ? q : d; 

847 
} 

848 
else 

849 
{ 

850 
d= d>0 ? 0 : d; 

851 
d= d<q ? q : d; 

852 
} 

853  
854 
src[l4]= d; 

855 
src[l5]+= d; 

856 
} 

857 
src++; 

818 
{ 

819 
int x; 

820 
src= stride; 

821 
for(x=0; x<BLOCK_SIZE; x++){ 

822 
const int middleEnergy= 5*(src[l5]  src[l4]) + 2*(src[l3]  src[l6]); 

823 
if(FFABS(middleEnergy)< 8*QP){ 

824 
const int q=(src[l4]  src[l5])/2; 

825 
const int leftEnergy= 5*(src[l3]  src[l2]) + 2*(src[l1]  src[l4]); 

826 
const int rightEnergy= 5*(src[l7]  src[l6]) + 2*(src[l5]  src[l8]); 

827  
828 
int d= FFABS(middleEnergy)  FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 

829 
d= FFMAX(d, 0); 

830  
831 
d= (5*d + 32) >> 6; 

832 
d*= FFSIGN(middleEnergy); 

833  
834 
if(q>0){ 

835 
d= d<0 ? 0 : d; 

836 
d= d>q ? q : d; 

837 
}else{ 

838 
d= d>0 ? 0 : d; 

839 
d= d<q ? q : d; 

840 
} 

841  
842 
src[l4]= d; 

843 
src[l5]+= d; 

858  844 
} 
859 
src=8; 

860 
for(x=0; x<8; x++) 

861 
{ 

862 
int y; 

863 
for(y=4; y<6; y++) 

864 
{ 

865 
int d= src[x+y*stride]  tmp[x+(y4)*8]; 

866 
int ad= FFABS(d); 

867 
static int max=0; 

868 
static int sum=0; 

869 
static int num=0; 

870 
static int bias=0; 

871  
872 
if(max<ad) max=ad; 

873 
sum+= ad>3 ? 1 : 0; 

874 
if(ad>3) 

875 
{ 

876 
src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; 

877 
} 

878 
if(y==4) bias+=d; 

879 
num++; 

880 
if(num%1000000 == 0) 
Also available in: Unified diff